def test_column_transformer_sparse_stacking():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)])
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert_true(sparse.issparse(X_trans))
    assert_equal(X_trans.shape, (X_trans.shape[0], X_trans.shape[0] + 1))
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0]))


def test_column_transformer_sparse_stacking():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.8)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert sparse.issparse(X_trans)
    assert_equal(X_trans.shape, (X_trans.shape[0], X_trans.shape[0] + 1))
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0]))
    assert len(col_trans.transformers_) == 2
    assert col_trans.transformers_[-1][0] != 'remainder'

    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.1)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert not sparse.issparse(X_trans)
    assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1)
    assert_array_equal(X_trans[:, 1:], np.eye(X_trans.shape[0]))
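
# Standalone sketch (not part of the test suite above): how `sparse_threshold`
# decides whether ColumnTransformer stacks its outputs as a sparse matrix or a
# dense array. Only standard scikit-learn estimators are used here.
import numpy as np
from scipy import sparse
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

X = np.array([[0.0, "a"], [1.0, "b"], [2.0, "c"]], dtype=object)

# The stacked output density is below 0.8 here, so the result stays sparse.
ct_sparse = ColumnTransformer([("num", StandardScaler(), [0]),
                               ("cat", OneHotEncoder(), [1])],
                              sparse_threshold=0.8)
print(sparse.issparse(ct_sparse.fit_transform(X)))   # True

# With a threshold of 0.0 the same output is densified instead.
ct_dense = ColumnTransformer([("num", StandardScaler(), [0]),
                              ("cat", OneHotEncoder(), [1])],
                             sparse_threshold=0.0)
print(sparse.issparse(ct_dense.fit_transform(X)))    # False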
Example #3
###############################################################################
# We will perform a 10-fold cross-validation and train the neural network with
# the two different strategies previously presented.

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)

cv_results_imbalanced = []
cv_time_imbalanced = []
cv_results_balanced = []
cv_time_balanced = []
for train_idx, valid_idx in skf.split(X_train, y_train):
    X_local_train = preprocessor.fit_transform(X_train.iloc[train_idx])
    y_local_train = y_train.iloc[train_idx].values.ravel()
    X_local_test = preprocessor.transform(X_train.iloc[valid_idx])
    y_local_test = y_train.iloc[valid_idx].values.ravel()

    elapsed_time, roc_auc = fit_predict_imbalanced_model(
        X_local_train, y_local_train, X_local_test, y_local_test)
    cv_time_imbalanced.append(elapsed_time)
    cv_results_imbalanced.append(roc_auc)

    elapsed_time, roc_auc = fit_predict_balanced_model(X_local_train,
                                                       y_local_train,
                                                       X_local_test,
                                                       y_local_test)
    cv_time_balanced.append(elapsed_time)
    cv_results_balanced.append(roc_auc)
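
# Not part of the original example: a short follow-up sketch summarizing the
# ROC AUC scores and timings collected above, assuming the four lists have been
# filled by the cross-validation loop.
import numpy as np

print("Imbalanced model: ROC AUC {:.3f} +/- {:.3f}, mean fit time {:.1f}s".format(
    np.mean(cv_results_imbalanced), np.std(cv_results_imbalanced),
    np.mean(cv_time_imbalanced)))
print("Balanced model:   ROC AUC {:.3f} +/- {:.3f}, mean fit time {:.1f}s".format(
    np.mean(cv_results_balanced), np.std(cv_results_balanced),
    np.mean(cv_time_balanced)))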

###############################################################################
def analyze_logistic(X,
                     y,
                     model,
                     scale_columns,
                     analyze_params=False,
                     balance_outcomes=False):
    """
    Function for doing analysis of logistic regression. Plots cumulative gain, confusion matrix
    and grid search of optimal learning rate/epochs in SGD with k-fold CV (optional).
    Performs scaling of all continuous features in the data set.

    Inputs:
    - X: design matrix, shape (n, p)
    - y: targets, shape (n,)
    - scale_columns: list of indices of which columns to MinMax scale
    - analyze_params: boolean, option to perform grid search of learning rate and n_epochs in SGD
    - balance_outcomes: boolean, option to balance training data in case of skewed classes
    """

    #split data in train/validate and test
    X_train_val, X_test, y_train_val, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.1)

    #balance training set such that outcomes are 50/50 in training data
    if balance_outcomes:
        non_default_inds = np.where(y_train_val == 0)[0]
        default_inds = np.where(y_train_val == 1)[0]

        remove_size = len(non_default_inds) - len(default_inds)
        remove_inds = np.random.choice(non_default_inds,
                                       size=remove_size,
                                       replace=False)

        X_train_val = np.delete(X_train_val, remove_inds, axis=0)
        y_train_val = np.delete(y_train_val, remove_inds, axis=0)
    #end if

    #scale continuous features
    minmaxscaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = ColumnTransformer(remainder='passthrough',
                               transformers=[('minmaxscaler', minmaxscaler,
                                              scale_columns)])

    #scale only test data at this point (CV scales training/validation)
    scaler.fit(X_train_val)
    X_test = scaler.transform(X_test)

    if analyze_params:

        #initialize vectors for saving results
        error_scores = pd.DataFrame(
            columns=['log eta', 'n_epochs', 'mse', 'r2', 'accuracy'])
        n_etas = 4
        eta_vals = np.linspace(-1, -4, n_etas)
        n_epoch_vals = np.array([10, 100, 500, 1000])
        n_epochs = len(n_epoch_vals)
        accuracy_scores = np.zeros((n_etas, n_epochs))

        max_accuracy = 0
        best_eta = 0
        best_n_epochs = 0

        #perform grid search of best learning rate
        #and number of epochs with k-fold cross-validation
        i = 0
        for eta in eta_vals:
            model.set_eta(10**eta)

            j = 0
            for epoch in n_epoch_vals:
                model.set_n_epochs(epoch)

                #perform cross validation
                mse, r2, accuracy = CV(X_train_val, y_train_val, model)
                accuracy_scores[i, j] = accuracy

                error_scores = pd.concat([
                    error_scores,
                    pd.DataFrame([{
                        'log eta': eta,
                        'n_epochs': epoch,
                        'mse': mse,
                        'r2': r2,
                        'accuracy': accuracy
                    }])
                ], ignore_index=True)

                #check if current configuration is better
                if accuracy > max_accuracy:
                    max_accuracy = accuracy
                    best_eta = eta
                    best_n_epochs = epoch

                j += 1
                #end for epoch
            i += 1
            #end for eta

        #set optimal model parameters
        model.set_eta(10**best_eta)
        model.set_n_epochs(best_n_epochs)

        #plot heatmap of grid search
        acc_table = pd.pivot_table(error_scores,
                                   values='accuracy',
                                   index=['log eta'],
                                   columns='n_epochs')
        idx_i = np.where(acc_table == max_accuracy)[0]
        idx_j = np.where(acc_table == max_accuracy)[1]

        fig = plt.figure()
        ax = sns.heatmap(acc_table,
                         annot=True,
                         fmt='.2g',
                         cbar=True,
                         linewidths=1,
                         linecolor='white',
                         cbar_kws={'label': 'Accuracy'})

        ax.add_patch(
            Rectangle((idx_j, idx_i), 1, 1, fill=False, edgecolor='red', lw=2))
        ax.set_xlabel('Number of epochs')
        ax.set_ylabel(r'log$_{10}$ of Learning rate')

        bottom, top = ax.get_ylim()
        ax.set_ylim(bottom + 0.5, top - 0.5)
        plt.show()
    #end if

    #scale training data
    X_train_val = scaler.transform(X_train_val)

    #pylearn model
    model.fit(X_train_val, y_train_val)
    pred_train = model.predict(X_train_val)
    pred_test = model.predict(X_test)

    #sklearn model
    clf = linear_model.LogisticRegressionCV()
    clf.fit(X_train_val, y_train_val)
    pred_skl = clf.predict(X_test)

    #get accuracy scores
    accuracy_on_test = accuracy_score(y_test, pred_test)
    accuracy_on_train = accuracy_score(y_train_val, pred_train)
    accuracy_skl = accuracy_score(y_test, pred_skl)

    #predict
    pred_train_prob = model.predict(X_train_val, probability=True)
    pred_test_prob = model.predict(X_test, probability=True)

    #get area ratio and plot cumulative gain
    area_ratio_train = cumulative_gain_area_ratio(y_train_val,
                                                  pred_train_prob,
                                                  title='Training results')
    area_ratio_test = cumulative_gain_area_ratio(y_test,
                                                 pred_test_prob,
                                                 title=None)
    plt.show()

    #plot confusion matrix
    ax1 = plot_confusion_matrix(y_test,
                                pred_test,
                                normalize=True,
                                cmap='Blues',
                                title=' ')
    ax2 = plot_confusion_matrix(y_train_val,
                                pred_train,
                                normalize=True,
                                cmap='Blues',
                                title='Training data')

    bottom, top = ax1.get_ylim()
    ax1.set_ylim(bottom + 0.5, top - 0.5)
    ax2.set_ylim(bottom + 0.5, top - 0.5)

    plt.show()

    #print some stats
    print('===accuracy and area ratio stats===')
    print('accuracy on test:', accuracy_on_test)
    print('accuracy on train:', accuracy_on_train)
    print('accuracy skl:', accuracy_skl)
    print('area ratio train:', area_ratio_train)
    print('area ratio test:', area_ratio_test)

    if analyze_params:
        print('===grid search stats===')
        print('max accuracy:', max_accuracy)
        print('eta:', best_eta)
        print('n_epochs:', best_n_epochs)
Example #5
                                 cv=8,
                                 scoring='accuracy',
                                 n_jobs=-1)
    results = pd.concat([
        results,
        pd.DataFrame([{
            'Model': type(model).__name__,
            'Train Score': model.fit(train_features, train_labels).score(
                train_features, train_labels),
            'Mean Val Score': cv_results.mean()
        }])
    ], ignore_index=True)

print(results)

best_model = models[results['Mean Val Score'].argmax()]
print('Best Model selected:', type(best_model).__name__)
best_model.fit(train_features, train_labels)
predictions = best_model.predict(train_features)
plot_confusion_matrix(best_model, train_features, train_labels)
plt.show()
print(classification_report(train_labels, predictions))

incorrect_predictions = train[predictions != train_labels]

test_features = transformer.transform(test[['text1']]).toarray()
predictions = best_model.predict(test_features).astype('int64')

output = pd.DataFrame({'id': test.iloc[:, 0], 'target': predictions})
output.to_csv('output.csv', index=False)
ct = ColumnTransformer([("scaling", StandardScaler(),
                         ['age', 'hours-per-week']),
                        ("onehot", OneHotEncoder(sparse=False),
                         ['workclass', 'education', 'gender', 'occupation'])])

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# get all columns apart from income for the features
data_features = data.drop("income", axis=1)
# split dataframe and income
X_train, X_test, y_train, y_test = train_test_split(data_features,
                                                    data.income,
                                                    random_state=0)

ct.fit(X_train)
X_train_trans = ct.transform(X_train)
print(X_train_trans.shape)
# Result:
# (24420, 44)

logreg = LogisticRegression(solver='lbfgs', max_iter=5000)
logreg.fit(X_train_trans, y_train)

X_test_trans = ct.transform(X_test)
print("\nTest score: {:.2f}".format(logreg.score(X_test_trans, y_test)))
# Result:
# Test score: 0.81

print("\nct.named_transformers_.onehot:\n", ct.named_transformers_.onehot)
# Result:
# ct.named_transformers_.onehot:
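
# Sketch under an extra assumption (scikit-learn >= 1.0): a fitted
# ColumnTransformer also exposes get_feature_names_out, which maps the 44
# transformed columns above back to readable names.
feature_names = ct.get_feature_names_out()
print(len(feature_names))   # 44, matching X_train_trans.shape[1]
print(feature_names[:3])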
Example #7
# The problem is solved with the help of dummy encoding
# NOTE: `categorical_features` was removed from OneHotEncoder in scikit-learn 0.22;
# newer code does this one-hot step through a ColumnTransformer instead.
onehotencoder1 = OneHotEncoder(categorical_features = [4, 5])
X = onehotencoder1.fit_transform(X).toarray()

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

print(X_train[0])

# Feature Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('one', StandardScaler(), [17, 18, 19, 20])], remainder='passthrough')
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
# y_fnl = []
# for i in range(0, len(y_pred)):
# 	y_fnl.append(int(y_pred[i]))
# 	print(y_pred[i],y_test[i])


#Making the Confusion Matrix
Example #8
pred_x = pred_x.drop('Instance', axis=1)

# print(pred_x)

# create imputer for missing values with different strategies for numerical vs categorical data,
# numerical takes the median, categorical the mode (most frequent)
ct = ColumnTransformer(transformers=[('cat_imp', SimpleImputer(strategy='most_frequent'), [1, 3, 5, 6, 7, 8]),
                                     ('num_imp', SimpleImputer(strategy='median'), [0, 2, 4, 9])],
                       remainder='passthrough')

# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(training_x, training_y, shuffle=True, test_size=0.2)

# apply imputer to data
ct.fit(X_train, y_train)
pred_x = ct.transform(pred_x)
X_train = ct.transform(X_train)
X_test = ct.transform(X_test)

# create catboost pool data structures specifying categorical features for both train and test data
pool_train = Pool(X_train, label=y_train, cat_features=[4, 5, 6, 7, 8, 9])
pool_test = Pool(X_test, label=y_test, cat_features=[4, 5, 6, 7, 8, 9])

print("Starting model creation")
# create catboostmodel
model = CatBoostRegressor(cat_features=[4, 5, 6, 7, 8, 9], eval_metric='RMSE', od_type='Iter', od_wait=10,
                          one_hot_max_size=40, task_type="GPU", devices='0', use_best_model=True,iterations=10000,
                          learning_rate=0.01, depth=10, l2_leaf_reg=3, random_strength=4, bagging_temperature=10,
                          border_count=255)
#fit model to data
model.fit(pool_train, eval_set=pool_test, use_best_model=True)
Example #9
def test_feature_name_validation():
    """Tests if the proper warning/error is raised if the columns do not match
    during fit and transform."""
    pd = pytest.importorskip("pandas")

    X = np.ones(shape=(3, 2))
    X_extra = np.ones(shape=(3, 3))
    df = pd.DataFrame(X, columns=['a', 'b'])
    df_extra = pd.DataFrame(X_extra, columns=['a', 'b', 'c'])

    tf = ColumnTransformer([('bycol', Trans(), ['a', 'b'])])
    tf.fit(df)

    msg = ("Given feature/column names or counts do not match the ones for "
           "the data given during fit.")
    with pytest.warns(DeprecationWarning, match=msg):
        tf.transform(df_extra)

    tf = ColumnTransformer([('bycol', Trans(), [0])])
    tf.fit(df)

    with pytest.warns(DeprecationWarning, match=msg):
        tf.transform(X_extra)

    with warnings.catch_warnings(record=True) as warns:
        tf.transform(X)
    assert not warns

    tf = ColumnTransformer([('bycol', Trans(), ['a'])], remainder=Trans())
    tf.fit(df)
    with pytest.warns(DeprecationWarning, match=msg):
        tf.transform(df_extra)

    tf = ColumnTransformer([('bycol', Trans(), [0, -1])])
    tf.fit(df)
    msg = "At least one negative column was used to"
    with pytest.raises(RuntimeError, match=msg):
        tf.transform(df_extra)

    tf = ColumnTransformer([('bycol', Trans(), slice(-1, -3, -1))])
    tf.fit(df)
    with pytest.raises(RuntimeError, match=msg):
        tf.transform(df_extra)

    with warnings.catch_warnings(record=True) as warns:
        tf.transform(df)
    assert not warns
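
# The tests above rely on a small helper transformer `Trans` defined elsewhere
# in scikit-learn's test module; a minimal stand-in consistent with how it is
# used here would be:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class Trans(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # pandas Series -> DataFrame, 1-D array -> column vector, else unchanged
        if hasattr(X, "to_frame"):
            return X.to_frame()
        if getattr(X, "ndim", 2) == 1:
            return np.atleast_2d(X).T
        return X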
Example #10
def main(train_X, test_X, train_y, test_y, conf1, conf2, roc_path):

    np.random.seed(414)

    warnings.filterwarnings(action='ignore', category=FitFailedWarning)

    # import the already split datasets
    X_train = pd.read_csv(train_X, index_col=0)
    y_train = pd.read_csv(train_y, index_col=0)
    X_test = pd.read_csv(test_X, index_col=0)
    y_test = pd.read_csv(test_y, index_col=0)

    # Test that X_train has more rows than X_test
    try:
        assert (X_train.shape[0] > X_test.shape[0])
    except Exception as bad_size:
        print(
            "X_train should have more rows than X_test.\nDid you put them in the wrong order?"
        )

    # Make validation set
    X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                          y_train,
                                                          test_size=0.2,
                                                          random_state=414)

    numeric_features = ["age", "result"]

    one_hot_features = [
        "gender", "ethnicity", "jaundice", "country_of_res", "used_app_before",
        "age_desc", "relation", "Class/ASD"
    ]

    other_columns = list(X_train.columns[0:10])

    preprocessor = ColumnTransformer(
        sparse_threshold=0,
        transformers=[("scale", StandardScaler(), numeric_features),
                      ("one_hot",
                       OneHotEncoder(drop=None, handle_unknown="ignore"),
                       one_hot_features)])

    X_train_temp = pd.DataFrame(
        preprocessor.fit_transform(X_train),
        index=X_train.index,
        columns=(numeric_features +
                 list(preprocessor.named_transformers_["one_hot"].
                      get_feature_names(one_hot_features))))

    X_test_temp = pd.DataFrame(preprocessor.transform(X_test),
                               index=X_test.index,
                               columns=X_train_temp.columns)

    X_valid_temp = pd.DataFrame(preprocessor.transform(X_valid),
                                index=X_valid.index,
                                columns=X_train_temp.columns)

    X_train = X_train_temp.join(X_train[other_columns])
    X_test = X_test_temp.join(X_test[other_columns])
    X_valid = X_valid_temp.join(X_valid[other_columns])

    le = LabelEncoder()

    y_train = le.fit_transform(y_train.to_numpy().ravel())
    y_test = le.transform(y_test.to_numpy().ravel())
    y_valid = le.transform(y_valid.to_numpy().ravel())

    ## Trying Gridsearch on different models to find best

    ## Initialize models
    # lr = LogisticRegression()
    dt = DecisionTreeClassifier(random_state=414)
    rf = RandomForestClassifier(random_state=414)
    svm = SVC(random_state=414)
    knn = KNeighborsClassifier()

    # Make list for models and a list to store their values
    estimators = [dt, rf, svm, knn]
    best_parameters = []
    best_precision_scores = []

    # Make list of dictionaries for parameters
    params = [  #{'C':[0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
        #'penalty': ['l1', 'l2']},
        {
            'max_depth': [1, 5, 10, 15, 20, 25, None],
            'max_features': [3, 5, 10, 15, 20, 50, None]
        },
        {
            'min_impurity_decrease': [0, 0.25, 0.5],
            'max_features': [3, 5, 10, 20, 50, 'auto']
        },
        {
            'C':
            [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'gamma': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
        },
        {
            'n_neighbors': [2, 5, 10, 15, 20, 50, 100],
            'algorithm': ['auto', 'brute']
        }
    ]

    # Run a for loop to find the best parameters for each model
    # Scoring = recall to reduce false negatives
    for i in range(len(estimators)):
        search = GridSearchCV(estimator=estimators[i],
                              param_grid=params[i],
                              cv=10,
                              n_jobs=-1,
                              scoring='recall')

        search_object = search.fit(X_train, y_train)

        # Store the output on each iteration
        best_parameters.append(search_object.best_params_)
        best_precision_scores.append(search_object.best_score_)

    best_parameters[np.argmax(best_precision_scores)]

    # the best recall score comes from a decision tree classifier with max_depth=15 and max_features=50
    # and recall = 0.46

    dt = DecisionTreeClassifier(max_depth=15, max_features=50)
    dt.fit(X_train, y_train).score(X_train, y_train)

    # It gets almost perfect on the train set

    dt.score(X_valid, y_valid)

    # and ~81% on the validation set

    prelim_matrix = pd.DataFrame(confusion_matrix(y_valid,
                                                  dt.predict(X_valid)))

    preliminary_matrix = prelim_matrix.rename(columns={
        0: "Predicted no autism",
        1: 'Predicted autism'
    },
                                              index={
                                                  0: "Does not have autism",
                                                  1: 'Has autism'
                                              })

    preliminary_matrix.to_csv(conf1)

    #print(classification_report(y_test, dt.predict(X_test)))

    ## Subset just the questions:

    questions = [
        'A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
        'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score'
    ]

    questions_train_df = X_train[questions]

    questions_valid_df = X_valid[questions]

    questions_test_df = X_test[questions]

    # Attribution: Varada Kolhatkar

    class ForwardSelection:
        def __init__(self,
                     model,
                     min_features=None,
                     max_features=None,
                     scoring=None,
                     cv=None):
            """
            Initialize a Forward selection model
            """
            self.max_features = max_features
            if min_features is None:
                self.min_features = 1
            else:
                self.min_features = min_features

            self.model = model
            self.scoring = scoring
            self.cv = cv
            self.ftr_ = []
            return

        def fit(self, X, y):
            """
            Fit a forward selection model        
            """

            error = np.inf
            best = None
            feature_index = list(range(0, (X.shape[1])))
            errors = []

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=514)

            X_temp = X_train

            while error > 0.0:
                if best is not None:
                    if best not in feature_index:
                        del feature_index[-2]
                        break
                    feature_index.remove(best)

                for i in feature_index:
                    self.model.fit(X_temp[:, self.ftr_ + [i]], y_train)
                    temp_error = 1 - np.mean(
                        cross_val_score(self.model, X[:, self.ftr_ + [i]],
                                        y, scoring='f1'))

                    if temp_error < error:
                        error = temp_error
                        best = i

                errors.append(round(error, 3))

                if len(errors) > 2:
                    if errors[-1] >= errors[-2]:
                        break

                if self.max_features is not None:
                    if len(errors) > self.max_features:
                        break

                self.ftr_.append(best)

        def transform(self, X, y=None):
            """
            Transform a test set        
            """
            return X[:, self.ftr_]

    fs = ForwardSelection(DecisionTreeClassifier(), max_features=None)

    fs.fit(questions_train_df.to_numpy(), y_train)

    fs.ftr_

    # No single question is better than any other question, so forward selection won't work
    # (or at least not with a decision tree)

    rfe = RFE(DecisionTreeClassifier(), n_features_to_select=5)

    rfe.fit(questions_train_df, y_train)

    # The top 5 questions:

    top_five = np.where(rfe.ranking_ == 1)[0]

    X_train_best_5 = questions_train_df.to_numpy()[:, top_five]
    X_test_best_5 = questions_test_df.to_numpy()[:, top_five]
    X_valid_best_5 = questions_valid_df.to_numpy()[:, top_five]

    dt2 = DecisionTreeClassifier()

    dt2.fit(X_train_best_5, y_train)

    pd.DataFrame(confusion_matrix(y_valid, dt2.predict(X_valid_best_5)))

    # Using just the top 5 questions gets a much worse result than using all the features

    # Try all questions:
    dt3 = DecisionTreeClassifier()

    dt3.fit(questions_train_df, y_train)

    conf_matrix = pd.DataFrame(confusion_matrix(y_test, dt.predict(X_test)))

    final_matrix = conf_matrix.rename(columns={
        0: "Predicted no autism",
        1: 'Predicted autism'
    },
                                      index={
                                          0: "Does not have autism",
                                          1: 'Has autism'
                                      })

    final_matrix.to_csv(conf2)

    # ROC curve

    fpr, tpr, _ = roc_curve(y_test, dt.predict_proba(X_test)[:, 1])

    roc_df = pd.DataFrame({"fpr": fpr, "tpr": tpr})

    line_df = pd.DataFrame({"start": [0, 1], "end": [0, 1]})

    roc = alt.Chart(roc_df).mark_line().encode(x=alt.X("fpr:Q"),
                                               y=alt.Y("tpr:Q"))

    line = alt.Chart(line_df).mark_line(
        strokeDash=[5, 5], color="orange").encode(
            x=alt.X("start:Q", axis=alt.Axis(title="False Positive Rate")),
            y=alt.Y("end:Q", axis=alt.Axis(title="True Positive Rate")))

    chart = (roc + line).configure_axis(titleFontSize=20).properties(
        title="ROC Curve").configure_title(fontSize=20)

    chart

    chart.save(roc_path)
Example #11
columns_to_transform = ['second_edu_speci', 'degree_title']
transformer = ColumnTransformer(transformers=[('encoder', OneHotEncoder(),
                                               columns_to_transform)],
                                remainder='passthrough')
X = transformer.fit_transform(dataset.drop(['placement'], axis=1))
y = dataset['placement']

#Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

#Scaling the features
scaler = ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                          [7, 9, 11, 13, 15])],
                           remainder='passthrough')
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#Evaluation metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

#Implementing different models

#Naive Bayes - Mean score = 0.80 - Report score = 81%
from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()

naive_score = cross_val_score(naive, X_train, y_train, cv=10)
print(np.mean(naive_score))
Example #12
naive_sin = []
naive_bin = []
nb_con = GaussianNB()
nb_sin = GaussianNB()
nb_bin = GaussianNB()

for i in semillas:
    X_train_con, X_test_con, Y_train_con, Y_test_con = train_test_split(
        X, Y, random_state=i)
    X_train_sin, X_test_sin, Y_train_sin, Y_test_sin = train_test_split(
        X_sin, Y, random_state=i)
    X_train_bin, X_test_bin, Y_train_bin, Y_test_bin = train_test_split(
        X_dif, Y, random_state=i)

    X_train_con_trans = trans.fit_transform(X_train_con)
    X_test_con_trans = trans.transform(X_test_con)
    X_train_sin_trans = trans.fit_transform(X_train_sin)
    X_test_sin_trans = trans.transform(X_test_sin)
    X_train_bin_trans = trans.fit_transform(X_train_bin)
    X_test_bin_trans = trans.transform(X_test_bin)

    regre_con.fit(X_train_con_trans, Y_train_con)
    regresion_con.append(
        accuracy_score(regre_con.predict(X_test_con_trans), Y_test_con))
    regre_sin.fit(X_train_sin_trans, Y_train_sin)
    regresion_sin.append(
        accuracy_score(regre_sin.predict(X_test_sin_trans), Y_test_sin))
    regre_bin.fit(X_train_bin_trans, Y_train_bin)
    regresion_bin.append(
        accuracy_score(regre_bin.predict(X_test_bin_trans), Y_test_bin))
Example #13
filling_indices = [
    x for x in range(len(X_test))
    if X_test[x, -1] != 'S' and X_test[x, -1] != 'Q' and X_test[x, -1] != 'C'
]
X_test[filling_indices, -1] = most_frequent_embarked

embarked_encoder = LabelEncoder()
X_train[:, -1] = embarked_encoder.fit_transform(X_train[:, -1])
X_test[:, -1] = embarked_encoder.transform(X_test[:, -1])

# one hot encoding pclass
ct_pclass = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough')
X_train = ct_pclass.fit_transform(X_train)
X_test = ct_pclass.transform(X_test)
# skipping dummy variable trap
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]

# one hot encoding embarked
ct_embarked = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [7])],
    remainder='passthrough')
X_train = ct_embarked.fit_transform(X_train)
X_test = ct_embarked.transform(X_test)
# skipping dummy variable trap
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]

from sklearn.preprocessing import StandardScaler
def main(input1, input2, output):

    # Read wrangled csv files
    df_train = pd.read_csv(f"./data/{input1}")
    df_test = pd.read_csv(f"./data/{input2}")
    X_train = df_train.drop(['Approved'], axis=1)
    y_train = df_train[['Approved']]
    X_test = df_test.drop(['Approved'], axis=1)
    y_test = df_test[['Approved']]

    ## Encoding categorical variables
    categorical_features = [
        'Sex', 'Ethnicity', 'Married', 'BankCustomer', 'EducationLevel',
        'PriorDefault', 'Employed', 'DriversLicense', 'Citizen', 'ZipCode'
    ]
    preprocessor = ColumnTransformer(
        transformers=[('ohe',
                       OneHotEncoder(handle_unknown='ignore', sparse=False),
                       categorical_features)])

    X_train = pd.DataFrame(preprocessor.fit_transform(X_train))
    X_test = pd.DataFrame(preprocessor.transform(X_test))
    y_train = y_train.to_numpy().ravel()
    y_test = y_test.to_numpy().ravel()

    #empty dictionary to store results
    results_dict = {}
    models = {
        'random forest': RandomForestClassifier(),
        'xgboost': XGBClassifier(),
        'lgbm': LGBMClassifier()
    }

    for model_name, model in models.items():
        t = time.time()
        #print(model_name, ":")
        clf = Pipeline(steps=[('classifier', model)])
        clf.fit(X_train, y_train)
        train_score, test_score = get_scores(clf,
                                             X_train,
                                             y_train,
                                             X_test,
                                             y_test,
                                             show=False)
        elapsed_time = time.time() - t
        results_dict[model_name] = [
            round(train_score, 3),
            round(test_score, 3),
            round(elapsed_time, 4)
        ]

    model_compare_dataframe = pd.DataFrame(results_dict)
    model_compare_dataframe.to_csv(f'./{output}/model_compare')

    ### Hyper parameter optimisation for Random Forest
    hyper_parameters = [{
        'n_estimators': [3, 5, 10, 50, 100],
        'criterion': ['gini', 'entropy'],
        'max_depth': [10, 20, 50, None]
    }]

    clf = GridSearchCV(RandomForestClassifier(),
                       hyper_parameters,
                       cv=StratifiedKFold(n_splits=3,
                                          shuffle=True,
                                          random_state=23),
                       verbose=0)
    best_model = clf.fit(X_train, y_train)

    # Measure accuracies
    train_predictions = best_model.predict(X_train)
    train_accuracy = accuracy_score(y_train, train_predictions)
    test_predictions = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    test_recall = recall_score(y_test, test_predictions)
    test_precision = precision_score(y_test, test_predictions)
    auc_score = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
    accuracies_df = pd.DataFrame(index=[
        'test accuracy', 'train accuracy', 'test recall', 'test precision',
        'auc score'
    ],
                                 data={
                                     'result': [
                                         test_accuracy, train_accuracy,
                                         test_recall, test_precision, auc_score
                                     ]
                                 })
    accuracies_df.to_csv(f'./{output}/accuracy_report')

    # plot and report confusion matrix
    plot_confusion_matrix(best_model, X_test, y_test)
    report = classification_report(y_test, test_predictions, output_dict=True)
    report_df = pd.DataFrame(report)
    report_df.to_csv(f'./{output}/classification_report')

    # compute and save roc curve
    fpr, tpr, thresholds = roc_curve(y_test,
                                     best_model.predict_proba(X_test)[:, 1])
    plt.plot(fpr, tpr)
    plt.title('ROC report')
    plt.plot((0, 1), (0, 1), '--k')
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.savefig(f'./{output}/roc.png')
Example #15
def main():
    # Get and load data
    get_data()
    housing = load_data()
    # display_data(housing)

    # Perform stratified sampling and split the data by strata
    strat_train_set, strat_test_set = do_stratified_sampling(housing)

    # Using the training set, play with the data
    # play_with_data(strat_train_set.copy())

    # Split data into predictors and labels
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()

    # Use an imputer to fill in missing values
    # We will fill in these values with the median
    imputer = SimpleImputer(strategy="median")
    # Get dataframe of only numerical vals
    housing_num = housing.drop("ocean_proximity", axis=1)

    # Let the imputer estimate based on the numerical housing vals
    imputer.fit(housing_num)
    # NOTE: The median of each attribute is stored in imputer.statistics_
    # Use trained imputer to fill in gaps by transforming the data
    X = imputer.transform(housing_num)
    # Insert np array into pandas DataFrame
    housing_tr = pd.DataFrame(X,
                              columns=housing_num.columns,
                              index=housing_num.index)

    # Convert categorical attribute to numerical attribute
    housing_cat = housing[["ocean_proximity"]]
    # Use one-hot encoding instead of ordinal encoding
    # as the categories are not ordered.
    cat_encoder = OneHotEncoder()

    # NOTE: This gives a scipy array which stores the location
    # of the "hot" encoding (instead of potentially storing
    # many many "cold" encodings (0's))
    # NOTE: Categories are stored in ordinal_encoder.categories_
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

    # Adding combined attributes
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    # Pipeline for transformations on numerical values
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

    housing_num_tr = num_pipeline.fit_transform(housing_num)

    # It is also possible to perform all of the above transformations
    # in one go
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]

    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    # This is the final set of training data
    housing_prepared = full_pipeline.fit_transform(housing)

    # Fit the linear regression model on prepared data
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)

    # Do some testing
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.transform(some_data)
    print("Predictions:", lin_reg.predict(some_data_prepared))
    print("Labels:", list(some_labels))

    # Get metrics
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print(lin_rmse)

    # Due to the above results being unsatisfactory
    # Try a decision tree regressor
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(housing_prepared, housing_labels)

    # Now do some testing on the tree regression model
    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    print(tree_rmse)

    # The above testing gives no error
    # Cross validation is performed on 10 folds (training and validating
    # 10 times, choosing a different fold for validation each time
    # and training on the remaining folds)
    scores = cross_val_score(tree_reg,
                             housing_prepared,
                             housing_labels,
                             scoring="neg_mean_squared_error",
                             cv=10)
    # As cross-validation expects a utility function (greater is better)
    # rather than a cost function, the returned scores are negative MSEs,
    # so we must flip their sign.
    tree_rmse_scores = np.sqrt(-scores)

    # Double check against cross validation on the linear reg. model
    lin_scores = cross_val_score(lin_reg,
                                 housing_prepared,
                                 housing_labels,
                                 scoring="neg_mean_squared_error",
                                 cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)

    print("TREE RSME SCORES")
    display_scores(tree_rmse_scores)

    print("LINEAR REG RMSE SCORES")
    display_scores(lin_rmse_scores)

    # This shows that the Decision Tree is overfitting
    # Therefore we try the Random Forest Regressor
    forest_reg = RandomForestRegressor()
    forest_reg.fit(housing_prepared, housing_labels)
    forest_scores = cross_val_score(forest_reg,
                                    housing_prepared,
                                    housing_labels,
                                    scoring="neg_mean_squared_error",
                                    cv=10)
    forest_rmse_scores = np.sqrt(-forest_scores)

    print("RANDOM FOREST REG RMSE SCORES")
    display_scores(forest_rmse_scores)

    # Fine-tuning by automatically searching for hyperparams
    # The grid tells the search to first try all combinations of the first dict,
    # followed by all combinations of the options in the second dict.
    param_grid = [
        {
            "n_estimators": [3, 10, 30],
            "max_features": [2, 4, 6, 8]
        },
        {
            "bootstrap": [False],
            "n_estimators": [3, 10],
            "max_features": [2, 3, 4]
        },
    ]

    forest_reg = RandomForestRegressor()
    # We use five-fold cross validation
    grid_search = GridSearchCV(forest_reg,
                               param_grid,
                               cv=5,
                               scoring="neg_mean_squared_error",
                               return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)

    # The best parameters are found using:
    print(f"Best hyperparams: {grid_search.best_params_}")
    # The best estimator:
    print(f"Best Estimator: {grid_search.best_estimator_}")
    # The evaluation scores:
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    # Examine the relative importance of each attribute for accurate predictions
    feature_importances = grid_search.best_estimator_.feature_importances_
    # Displaying the importance scores next to their attribute names
    extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
    cat_encoder = full_pipeline.named_transformers_["cat"]
    cat_one_hot_attribs = list(cat_encoder.categories_[0])
    attributes = num_attribs + extra_attribs + cat_one_hot_attribs
    print(sorted(zip(feature_importances, attributes), reverse=True))
    # NOTE: The above may indicate which features may be dropped

    # Evaluation on test set
    # Select the best estimator found by the grid search as the final model
    final_model = grid_search.best_estimator_

    # Separate test set into predictors and labels
    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()

    # NOTE: Only transform test data, DO NOT FIT the model on test data
    X_test_prepared = full_pipeline.transform(X_test)

    final_predictions = final_model.predict(X_test_prepared)
    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)

    # Compute 95% confidence interval
    confidence = 0.95
    squared_errors = (final_predictions - y_test)**2
    np.sqrt(
        stats.t.interval(confidence,
                         len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))

    # The following is inserted into our SelectImportantFeatures'
    # fit method, however we add it here for testing later.
    top_k_feature_indices = top_importances(feature_importances, 5)

    # New pipeline, now reducing the data's features to be
    # restricted to the top 5 most important features
    prep_and_feature_pipeline = Pipeline([
        ("prep", full_pipeline),
        ("feature", SelectImportantFeatures(feature_importances, 5))
    ])

    trimmed_housing = prep_and_feature_pipeline.fit_transform(housing)
    # NOTE: If we were to do trimmed_housing[0:3] and
    # housing_prepared[0:3, top_k_feature_indices],
    # the output would be the same.
    print(trimmed_housing[0:3])
    print(housing_prepared[0:3, top_k_feature_indices])
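
# The `display_scores` helper called above is not shown in this snippet; a
# minimal stand-in consistent with how it is used (print the cross-validation
# scores together with their mean and spread) could look like this:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())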
Example #16
categorical_features = list(category_map.keys())
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', ordinal_transformer, ordinal_features),
    ('cat', categorical_transformer, categorical_features)
])

# train an RF model
print("Train random forest model")
np.random.seed(0)
clf = RandomForestClassifier(n_estimators=50)
pipeline = Pipeline([('preprocessor', preprocessor), ('clf', clf)])
pipeline.fit(X_train, Y_train)

print("Creating an explainer")
explainer = alibi.explainers.AnchorTabular(
    predictor=lambda x: clf.predict(preprocessor.transform(x)),
    feature_names=feature_names,
    categorical_names=category_map)
explainer.fit(X_train)
explainer.predict_fn = None  # Clear explainer predict_fn as it's a lambda and will be reset when loaded

print("Saving individual files")

with open("explainer.dill", 'wb') as f:
    dill.dump(explainer, f)
joblib.dump(pipeline, 'model.joblib')
Example #17
class RepeatingBasisFunction(TransformerMixin, BaseEstimator):
    """
    This is a transformer for features with some form of circularity.
    E.g. for days of the week you might face the problem that, conceptually, day 7 is as
    close to day 6 as it is to day 1, while numerically their distances are different.
    This transformer remedies that problem.
    The transformer selects a column and transforms it with a given number of repeating
    (radial) basis functions. Radial basis functions are bell-curve shaped functions
    which take the original data as input. The basis functions are equally spaced over
    the input range. The key feature of repeating basis functions is that they are
    continuous when moving from the max to the min of the input range. As a result these
    repeating basis functions can capture how close each datapoint is to the center of
    each repeating basis function, even when the input data has a circular nature.

    :type column: int or list, default=0
    :param column: Indexes the data on its second axis. Integers are interpreted as
        positional columns, while strings can reference DataFrame columns by name.

    :type remainder: {'drop', 'passthrough'}, default="drop"
    :param remainder: By default, only the specified column is transformed, and the
        non-specified columns are dropped. (default of ``'drop'``). By specifying
        ``remainder='passthrough'``, all remaining columns will be automatically passed
        through. This subset of columns is concatenated with the output of the transformer.

    :type n_periods: int, default=12
    :param n_periods: number of basis functions to create, i.e., the number of columns that
        will exit the transformer.

    :type input_range: tuple or None, default=None
    :param input_range: the values at which the data repeats itself. For example, for days of
        the week this is (1,7). If input_range=None it is inferred from the training data.
    """
    def __init__(self,
                 column=0,
                 remainder="drop",
                 n_periods=12,
                 input_range=None):
        self.column = column
        self.remainder = remainder
        self.n_periods = n_periods
        self.input_range = input_range
        self.pipeline = None

    def fit(self, X, y=None):
        self.pipeline = ColumnTransformer(
            [(
                "repeatingbasis",
                _RepeatingBasisFunction(n_periods=self.n_periods,
                                        input_range=self.input_range),
                [self.column],
            )],
            remainder=self.remainder,
        )

        self.pipeline.fit(X, y)

        return self

    def transform(self, X):
        check_is_fitted(self, ["pipeline"])
        return self.pipeline.transform(X)
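
# Usage sketch for the transformer defined above (assumes the private helper
# `_RepeatingBasisFunction` it wraps is importable, as in scikit-lego): encode
# day-of-year as 12 smooth features that wrap around the year boundary.
import numpy as np
import pandas as pd

df = pd.DataFrame({"day_of_year": np.arange(1, 366)})
rbf = RepeatingBasisFunction(column="day_of_year",
                             n_periods=12,
                             input_range=(1, 365),
                             remainder="drop")
rbf.fit(df)
X_rbf = rbf.transform(df)
print(X_rbf.shape)   # expected (365, 12); nearby days get similar encodings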
Example #18
svm_rmse_scores = np.sqrt(-svm_scores)
print("\nSVM Regression scores (train set): \n")
display_scores(svm_rmse_scores)

tree_rmse_scores = np.sqrt(-tree_scores)
print("\nDT Regression scores (train set): \n")
display_scores(tree_rmse_scores)

forest_rmse_scores = np.sqrt(-forest_scores)
print("\nRF Regression scores (train set): \n")
display_scores(forest_rmse_scores)

#2-5
X_test = strat_test_set.drop("burned_area", axis=1)
y_test = strat_test_set["burned_area"].copy()
X_test_prepared = full_pipeline.transform(X_test)

sgd_scores = cross_val_score(sgd_reg,
                             X_test_prepared,
                             y_test,
                             scoring="neg_mean_squared_error",
                             cv=10)
sgd_rmse_scores = np.sqrt(-sgd_scores)
print("\nSGD Regression scores (test set): \n")
display_scores(sgd_rmse_scores)

svm_scores = cross_val_score(svm_reg,
                             X_test_prepared,
                             y_test,
                             scoring="neg_mean_squared_error",
                             cv=10)
Example #19
# Encoding for the Gender Column
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
X_train[:, 1] = le.fit_transform(X_train[:, 1])
X_test[:, 1] = le.transform(X_test[:, 1])
fam_test[:, 1] = le.transform(fam_test[:, 1])

# Encoding X categorical data + HotEncoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([('encoder', OneHotEncoder(), [-1])],
                       remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train), dtype=float)
X_test = np.array(ct.transform(X_test), dtype=float)
fam_test = np.array(ct.transform(fam_test), dtype=float)

# Avoiding Dummy Variable Trap
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]
fam_test = fam_test[:, 1:]

# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
fam_test = sc_X.transform(fam_test)
Example #20
def make_features(transformer: ColumnTransformer, df: pd.DataFrame) -> pd.DataFrame:
    return pd.DataFrame(transformer.transform(df))
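
# A small variant sketch (assumes scikit-learn >= 1.0 and an already fitted
# transformer): keep the generated column names instead of the default integer
# labels when wrapping the transformed array in a DataFrame.
import pandas as pd
from sklearn.compose import ColumnTransformer


def make_named_features(transformer: ColumnTransformer,
                        df: pd.DataFrame) -> pd.DataFrame:
    return pd.DataFrame(transformer.transform(df),
                        columns=transformer.get_feature_names_out(),
                        index=df.index)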
Example #21
    def _preprocessor(self, x, y=None, training=False):
        """ 
        Preprocess input of the network.
          
        Arguments:
            - x {pd.DataFrame} -- Raw input array of shape 
                (batch_size, input_size).
            - y {pd.DataFrame} -- Raw target array of shape (batch_size, 1).
            - training {boolean} -- Boolean indicating if we are training or 
                testing the model.

        Returns:
            - {torch.tensor} -- Preprocessed input array of size 
                (batch_size, input_size).
            - {torch.tensor} -- Preprocessed target array of size 
                (batch_size, 1).

        """

        #######################################################################
        #                       ** START OF YOUR CODE **
        #######################################################################

        # --------------------------------------------------------------------
        # SORT THE DATA
        column_names = [
            'longitude', 'latitude', 'housing_median_age', 'total_rooms',
            'total_bedrooms', 'population', 'households', 'median_income',
            'ocean_proximity'
        ]
        # numerical features
        numeric_features = [
            'longitude', 'latitude', 'housing_median_age', 'total_rooms',
            'total_bedrooms', 'population', 'households', 'median_income'
        ]
        # get the numerical features
        features = x[column_names]

        # --------------------------------------------------------------------
        # HANDLE CATEGORICAL FEATURES
        # Get dummies to transform categorical to Numerical
        features = pd.get_dummies(features)

        # Make sure the features are present in the dataset
        if 'ocean_proximity_ISLAND' not in features.columns.values:
            features['ocean_proximity_ISLAND'] = 0
        elif 'ocean_proximity_NEAR BAY' not in features.columns.values:
            features['ocean_proximity_NEAR BAY'] = 0

        #Drop one column to avoid multicollinearity: 'ocean_proximity_NEAR OCEAN'
        features = features[[
            'longitude', 'latitude', 'housing_median_age', 'total_rooms',
            'total_bedrooms', 'population', 'households', 'median_income',
            'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
            'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY'
        ]]

        outputs = y

        # --------------------------------------------------------------------
        # PRE PROCESSING
        if (training):

            #Impute the median for missing values and rescale
            numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ])

            #Transform data to numeric, pass through others
            ct = ColumnTransformer(transformers=[
                ('num', numeric_transformer, numeric_features),
            ],
                                   remainder='passthrough')

            #Processed data transformed
            df_processed = ct.fit_transform(X=features)

            # --------------------------------------------------------------------
            # SAVE MODEL
            #Save  the Transfomer in a pkl file
            dump(ct, open("x_transformer.pkl", "wb"))

            #Transform y -> is probably not necessary
            if y is not None:
                y_scaler = MinMaxScaler()
                outputs = y_scaler.fit_transform(outputs)
                dump(y_scaler, open("y_transformer.pkl", "wb"))

        #If we've seen data before transform with saved preprocessors
        else:

            #Load Column Transformer and Transform data
            ct = load(open('x_transformer.pkl', 'rb'))
            df_processed = ct.transform(features)

            #Load Transformer for y
            if y is not None:
                y_scaler = load(open('y_transformer.pkl', 'rb'))
                outputs = y_scaler.transform(outputs)

        # --------------------------------------------------------------------
        # RETURN AS TENSORS
        x_tensor = torch.tensor(df_processed, dtype=torch.float32)

        # check if y is in the data
        if y is not None:
            y_tensor = torch.tensor(y.values, dtype=torch.float32)

        return x_tensor, (y_tensor if isinstance(y, pd.DataFrame) else None)
Example #22
class kickstarter_predictor():
    
    def __init__(self) -> None:
        self._RSEED=42
        self._json_cols=['category', 'location']

        self._cat_features_impute = ['country', 'currency', 'category_name', 'location_type']

        self._cat_features_onehot = ['country', 'currency', 'category_name', 'location_type']

        self.preprocessor = ColumnTransformer(
            transformers=[
            #('cat_impute', SimpleImputer(strategy='constant', fill_value='missing'), self._cat_features_impute),
            ('cat_onehot', OneHotEncoder(handle_unknown='ignore'), self._cat_features_onehot),
            ('untouched', 'passthrough', ['duration','goal_usd', 'launched_at_month', 'created_at_month'])
            #('untouched', 'passthrough', ['deadline','static_usd_rate', 'goal', 'launched_at', 'created_at'])
            ],
            sparse_threshold=0
        )
        self.model = RandomForestClassifier(n_estimators=120, random_state=self._RSEED, max_features = 'sqrt', n_jobs=-1, verbose = 1)
        try:
            mkdir('./output')
        except OSError:
            print ("Creation of the directory output failed.")
 
    def expand_json_cols(self, df):
        """
        Expand columns that contain json objects

        Parameters
        ---------
        df: Pandas DataFrame

        Returns
        --------
        df: Pandas DataFrame
        """
        df_dicts = pd.DataFrame()
        print('---------- Parsing json ------------')
        for col in self._json_cols:
            print('Parsing json: '+col)
            c = []
            for i, val in df[col].items():
                try:
                    c.append(json.loads(val))
                except:
                    c.append(dict())
            df_dicts[col] = pd.Series(np.array(c))
        print('---------- Expanding dictionaries --------')
        df_expanded = []
        for col in df_dicts.columns:
            print('Expanding: '+col)
            df_expanded.append(pd.json_normalize(df_dicts[col]).add_prefix(col+'_'))
        df = pd.concat([df.drop(self._json_cols, axis=1), pd.concat(df_expanded, axis=1)], axis=1)
        return df

    def data_cleaning(self, df):
        """
        Filter data frame by relevant columns and rows.

        Parameters
        ---------
        df: Pandas DataFrame

        Returns
        --------
        df: Pandas DataFrame
        """
        self.base_features = ['country', 'currency', 'category_name', 'location_type', 'goal', 
                    'launched_at', 'created_at', 'blurb', 'state', 'deadline', 'static_usd_rate']
        df = df[self.base_features]

        #df.dropna(inplace=True)

        df = df.query("state == 'successful' or state == 'failed'")
        dic = {'successful' : 1, 'failed' : 0}
        df['state'] = df['state'].map(dic)

        return df
    
    def feature_engineering(self, df):
        """
        Add custom features

        Parameters
        ---------
        df: Pandas DataFrame

        Returns
        --------
        df: Pandas DataFrame
        """
        df['duration'] = (df.deadline-df.launched_at)/(3600*24)
        df['duration'] = df['duration'].round(2)
        df.drop(['deadline'], axis=1, inplace=True)

        df['goal_usd'] = df['goal'] * df['static_usd_rate']
        df['goal_usd'] = df['goal_usd'].round(2)
        df.drop(['static_usd_rate', 'goal'], axis=1, inplace=True)

        df['launched_at_full'] = pd.to_datetime(df['launched_at'], unit='s')
        df['launched_at_month'] = pd.DatetimeIndex(df['launched_at_full']).month
        df.drop(['launched_at', 'launched_at_full'], axis=1, inplace=True)

        df['created_at_full'] = pd.to_datetime(df['created_at'], unit='s')
        df['created_at_month'] = pd.DatetimeIndex(df['created_at_full']).month
        df.drop(['created_at', 'created_at_full'], axis=1, inplace=True)

        df['blurb_len'] = [len(x.split(" ")) if isinstance(x, str) else 0
                           for x in df.blurb]
        df.drop(['blurb'], axis=1, inplace=True)
        return df

    def read_csv(self, name):
        """
        Read csv file in kickstarter format

        Parameters
        ---------
        name: String. Only for display purposes.

        Returns
        --------
        df: Pandas DataFrame
        """
        file_name = input(f"Please enter {name} csv file name: ")
        if not file_name:
            file_name = './data/Kickstarter003.csv'
            print(f'Taking default file {file_name}')
        return pd.read_csv(file_name)
    
    def processor_lossy(self, df):
        """
        Apply data frame preprocessing. Outside of sklearn.pipeline

        Parameters
        ---------
        df: Pandas DataFrame

        Returns
        --------
        df: Pandas DataFrame
        """
        df = self.expand_json_cols(df)
        df = self.data_cleaning(df)
        X = df.drop('state', axis=1)
        y = df.state
        return X, y
   
    def dump_model(self): 
        #r = f"{rmse_score_final:.0f}".replace('.','') 
        t = datetime.now().strftime("%Y-%m-%d_%H%M%S")
        o = f"./output/model_dump_{t}.pickle"
        print(f'Dumping model to pickle: {o}')
        dump(self.model, o)

    def model_fit_and_export(self):
        """
        Wrapper for fit and export tasks

        Parameters
        ---------
        None

        Returns
        --------
        None
        """
        df = self.read_csv('train')
        self.X_train, self.y_train = self.processor_lossy(df)
        self.X_train = self.feature_engineering(self.X_train)
        self.X_train = self.preprocessor.fit_transform(self.X_train)
        self.model.fit(self.X_train, self.y_train)
        self.dump_model()

    def model_load(self):
        """
        Load model from pickle file and store in class attribute.

        Parameters
        ---------
        None

        Returns
        --------
        None
        """
        model_file_name = input('Please enter model file name: ')
        if model_file_name:
            self.model = load(model_file_name)
        else:
            print('Taking previously trained model.')

    def printscore(self): 
        print(classification_report(self.y_test, self.y_pred)) 
    
    def prediction_tocsv(self):
        #r = f"{rmse_score_final:.0f}".replace('.','') 
        t = datetime.now().strftime("%Y-%m-%d_%H%M%S")
        o = f"./output/y_pred_{t}.csv"
        print(f'Writing prediction to csv: {o}')
        pd.DataFrame(self.y_pred).to_csv(o, index = False)
    
    def readcsv_and_predict(self):
        """
        Wrapper for read and predict tasks

        Parameters
        ---------
        None

        Returns
        --------
        None
        """
        df = self.read_csv('test')
        self.X_test, self.y_test = self.processor_lossy(df)  
        self.X_test = self.feature_engineering(self.X_test)
        self.X_test = self.preprocessor.transform(self.X_test)
        self.y_pred = self.model.predict(self.X_test)
        self.printscore()
        self.prediction_tocsv()
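

# A minimal usage sketch for the class above (assumes the Kickstarter train and
# test CSVs prompted for by read_csv are available); not part of the original
# snippet.
if __name__ == '__main__':
    predictor = kickstarter_predictor()
    predictor.model_fit_and_export()   # train on the 'train' csv, pickle the model
    predictor.readcsv_and_predict()    # score the 'test' csv, print report, write csv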
示例#23
df = pd.read_csv('train.csv')
df_sub = pd.read_csv('test.csv')
case_id = df_sub.id
df_sub = df_sub.drop(['id'], axis=1)

X = df.iloc[:, 1:11].values
y = df.iloc[:, 11].values
X_sub = df_sub.iloc[:, :].values

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ohe = OneHotEncoder()
ctX = ColumnTransformer([('X', ohe, [0, 5, 6])], remainder='passthrough')
X = ctX.fit_transform(X)
X_sub = ctX.transform(X_sub)

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X[:, [7, 11, 13]] = sc.fit_transform(X[:, [7, 11, 13]])
X_sub[:, [7, 11, 13]] = sc.transform(X_sub[:, [7, 11, 13]])

# "balanced" class weights: w_c = n_samples / (n_classes * n_c)
neg, pos = np.bincount(y)
total = neg + pos
w0 = total / (2 * neg)
w1 = total / (2 * pos)
#weights = {0: w0, 1: w1}
weights = [w0, w1]

from imblearn.over_sampling import RandomOverSampler, SMOTE
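# The import above suggests the next step is to rebalance the training data.
# A minimal sketch (assuming X and y from above, and a made-up random seed;
# not part of the original snippet):
X_res, y_res = SMOTE(random_state=42).fit_resample(X, y)
print(np.bincount(y_res))  # both classes should now have the same count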
示例#24
    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='median')),
               ('scaler', StandardScaler())])
    categorical_features = features_train.select_dtypes(
        include=['category']).columns
    categorical_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
               ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features),
                      ('cat', categorical_transformer, categorical_features)])
    features_train = preprocessor.fit_transform(features_train)
    features_test = preprocessor.transform(features_test)
    print(features_train.shape)

# Train the model.
if args.library == 'pytorch':
    from pytorch_linear import LinearRegressor
    model = LinearRegressor(batch_size=1000, n_epochs=1, learning_rate=0.01)
    model.fit(features_train, labels_train)
elif args.library == 'sklearn':
    from sklearn.linear_model import SGDRegressor
    model = SGDRegressor(
        max_iter=1,
        eta0=0.01,
        learning_rate='constant',
        tol=None,
    )
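    # The snippet is cut off here; presumably the sklearn branch continues the
    # same way as the pytorch one above. A minimal, assumed continuation:
    model.fit(features_train, labels_train)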
示例#25
def predict(fn_code, inputVal, predictionVal):
    if fn_code == 1:
        return fib_exp_regression(predictionVal)
    elif fn_code == 3:
        return merge_sort_poly_regression(predictionVal)
    # any other fn_code falls through to the plain linear regression below
    # (the original early return made the rest of this function unreachable)
    df_train = pd.read_csv("./final.csv", header=0, index_col=False)
    actual = df_train["tottime"].tolist()
    df_train, df_test = train_test_split(df_train,
                                         test_size=0.1,
                                         random_state=100)

    target = ["tottime"]
    numeric_features = ["ncalls"]
    all_features = numeric_features

    imputers = [("numeric", SimpleImputer(strategy="median"), numeric_features)
                ]

    impute_transformer = ColumnTransformer(transformers=imputers)
    impute_transformer.fit(df_train)

    df_train_imp = pd.DataFrame(impute_transformer.transform(df_train),
                                index=df_train.index,
                                columns=all_features)
    df_test_imp = pd.DataFrame(impute_transformer.transform(df_test),
                               index=df_test.index,
                               columns=all_features)

    feature_transformers = [('scale', StandardScaler(), numeric_features)]

    feature_preprocessor = ColumnTransformer(transformers=feature_transformers)
    feature_preprocessor.fit(df_train_imp)

    X_train_imp_encode = feature_preprocessor.transform(df_train_imp)
    X_test_imp_encode = feature_preprocessor.transform(df_test_imp)

    df_train_imp_encode = pd.DataFrame(X_train_imp_encode,
                                       index=df_train_imp.index,
                                       columns=numeric_features)
    df_test_imp_encode = pd.DataFrame(X_test_imp_encode,
                                      index=df_test_imp.index,
                                      columns=numeric_features)

    y_train = df_train["tottime"]
    y_test = df_test["tottime"]

    # print(X_train_imp_encode.shape)
    # print("===========")
    # print(y_train.shape)

    lr = LinearRegression()
    lr.fit(X_train_imp_encode, y_train)

    predictions = []

    # now predict can be used with lr; new inputs must go through the same
    # scaling that was applied to the training data
    for x in range(inputVal + 1, predictionVal + 1):
        x_row = pd.DataFrame({"ncalls": [x]})
        predictions.extend(lr.predict(feature_preprocessor.transform(x_row)))

    return {"predictions": predictions, "actual": actual}
示例#26
                                                    data["target"],
                                                    shuffle=False,
                                                    test_size=0.20)

cat = [t != "int64" for t in x_train.dtypes]
num = [t == "int64" for t in x_train.dtypes]

cat_names = x_train.columns[cat]
num_names = x_train.columns[num]

transformer = ColumnTransformer(
    [("num", StandardScaler(), num),
     ("cat", OneHotEncoder(handle_unknown="ignore"), cat)], )

x_train = transformer.fit_transform(x_train)
x_test = transformer.transform(x_test)

# get_feature_names was removed from OneHotEncoder in newer scikit-learn
# releases; use get_feature_names_out(cat_names) on scikit-learn >= 1.0
cat_names = transformer.transformers_[1][1].get_feature_names(cat_names)

all_feature_names = list(num_names)
all_feature_names.extend(cat_names)

model = XGBClassifier(max_depth=5,
                      n_estimators=100,
                      min_child_weight=3,
                      colsample_bytree=0.68,
                      subsample=0.63)

model.fit(x_train,
          y_train,
          eval_set=[(x_train, y_train), (x_test, y_test)])
示例#27
def main(file):
    facts = pd.read_csv(file, encoding='unicode_escape')

    # drop useless features
    facts = facts.drop(['_unit_id'], axis=1)
    facts = facts.drop(["_golden"], axis=1)
    facts = facts.drop(["_last_judgment_at"], axis=1)

    facts = facts.drop(["tweet_created"], axis=1)
    facts = facts.drop(["tweet_id"], axis=1)
    facts = facts.drop(["tweet_location"], axis=1)
    facts = facts.drop(["name"], axis=1)
    facts = facts.drop(["airline_sentiment_gold"], axis=1)

    # label encoding
    from sklearn.preprocessing import LabelEncoder, StandardScaler
    labelencoder = LabelEncoder()
    facts['label'] = labelencoder.fit_transform(facts['airline_sentiment'])
    # check label
    print('2 is', labelencoder.inverse_transform([2]))
    print('1 is', labelencoder.inverse_transform([1]))
    print('0 is ', labelencoder.inverse_transform([0]))
    # drop label
    facts = facts.drop(["airline_sentiment"], axis=1)

    # all useful features
    category_fea = [
        '_unit_state', 'airline', 'user_timezone', 'negativereason',
        'negativereason_gold'
    ]
    num_fea = [
        '_trusted_judgments', 'airline_sentiment:confidence',
        'negativereason:confidence', 'retweet_count'
    ]
    text_fea = ['text']

    # fill all null values
    # fill numerical features with median
    median1 = facts['negativereason:confidence'].median()
    facts['negativereason:confidence'] = facts[
        'negativereason:confidence'].fillna(median1)
    median2 = facts['airline_sentiment:confidence'].median()
    facts['airline_sentiment:confidence'] = facts[
        'airline_sentiment:confidence'].fillna(median2)
    # fill categorical features with the most frequent value
    common = facts['user_timezone'].mode()[0]
    facts['user_timezone'] = facts['user_timezone'].fillna(common)
    # where no negative reason was given, fill with the placeholder 'none'
    facts['negativereason'] = facts['negativereason'].fillna('none')
    facts['negativereason_gold'] = facts['negativereason_gold'].fillna('none')

    facts = facts.drop(["tweet_coord"], axis=1)

    # split
    from sklearn.model_selection import train_test_split
    train, test = train_test_split(facts, test_size=0.2, random_state=42)
    simplefilter(action='ignore', category=FutureWarning)
    # bag of words
    cvec = CountVectorizer(
        lowercase=False,
        ngram_range=(1, 2),
        # vocabulary=whitelist,   # You can work with your own whitelist
        max_features=5000,  # or keep only the N most frequent items
        token_pattern=u"(?u)\\b\\S+\\b",  # keeps punctuation inside tokens
        analyzer="word")

    cvec.fit(train['text'])

    # preprocessing
    num_fea = [
        '_trusted_judgments', 'airline_sentiment:confidence',
        'negativereason:confidence', 'retweet_count'
    ]
    num_fea_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    category_fea = [
        '_unit_state', 'airline', 'user_timezone', 'negativereason'
    ]
    categorical_transformer = Pipeline(
        steps=[('onehot',
                OneHotEncoder(handle_unknown='ignore'))  # For SVM and similar
               # ('ordinal', OrdinalEncoder())  # For Trees/Gradient Boosting
               ])

    text_fea = ['text']
    text_transformer = ColumnTransformer(transformers=[
        ('count', cvec, "text"),
    ])

    preprocessor = ColumnTransformer(
        transformers=[('sca', num_fea_transformer, num_fea),
                      ('text', text_transformer, text_fea),
                      ('cat', categorical_transformer, category_fea)])

    # encoded train and test data
    encoded = preprocessor.fit_transform(train)
    X_test = preprocessor.transform(test)

    # evaluate models
    # XGBOOST
    from sklearn.metrics import classification_report

    # boost_model = xgb.XGBClassifier().fit(encoded, train['label'])
    # X_test = preprocessor.transform(test)
    # preds = boost_model.predict(X_test)

    # print('classification report for xgboost:\n', classification_report(test["label"], preds))

    # boosting
    from sklearn.ensemble import GradientBoostingClassifier
    gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
    gb_clf.fit(encoded, train['label'])
    predict_gb = gb_clf.predict(X_test)
    print('classification report for gradient boost:\n',
          classification_report(test["label"], predict_gb))

    # Logistic regression
    from sklearn.linear_model import LogisticRegression
    lg = LogisticRegression()
    lg.fit(encoded, train['label'])
    predict_lg = lg.predict(X_test)
    print('classification report for logistic regression :\n',
          classification_report(test["label"], predict_lg))

    # pasting
    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier
    bag_clf = BaggingClassifier(LogisticRegression(multi_class='auto',
                                                   solver='lbfgs',
                                                   max_iter=10000),
                                n_estimators=100,
                                max_samples=int(np.ceil(0.6 *
                                                        encoded.shape[0])),
                                bootstrap=False,
                                n_jobs=3,
                                random_state=42)
    bag_clf.fit(encoded, train['label'])
    pred_bag = bag_clf.predict(X_test)
    print('classification report for pasting(Logistic regression):\n',
          classification_report(test["label"], pred_bag))

    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier
    bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                                n_estimators=100,
                                max_samples=int(np.ceil(0.6 *
                                                        encoded.shape[0])),
                                bootstrap=False,
                                n_jobs=3,
                                random_state=42)
    bag_clf.fit(encoded, train['label'])
    pred_bag = bag_clf.predict(X_test)
    print('classification report for pasting(Decision Tree Classifier):\n',
          classification_report(test["label"], pred_bag))
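

# Hypothetical entry point; the original snippet only defines main(), and the
# CSV path below is made up for illustration.
if __name__ == '__main__':
    main('./data/airline_tweets.csv')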
示例#28
# Reconstructed loop head (the snippet starts mid-statement): collect the grid
# search results keyed by RMSE; the cv scores are negative MSE values.
results = {}
for ms, params in zip(gs.cv_results_['mean_test_score'],
                      gs.cv_results_['params']):
    results[np.sqrt(-ms)] = params

# #### c. Test

# In[82]:

final = gs.best_estimator_

# In[83]:

strat_test_set.head()

# In[88]:

x_test = full_pipeline.transform(
    strat_test_set.drop('median_house_value', axis=1))
y_test = strat_test_set['median_house_value'].copy()

# In[98]:

a = final.predict(x_test)

# In[100]:

final

# In[99]:

from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(a, y_test))
示例#29
'''Scaling and transforming data'''
print("--- Scaling and transforming data ---\n")
numerical_X_train = X_train.drop(['PULocationID', 'DOLocationID'], axis=1)
num_attribs_X_train = list(numerical_X_train)
cat_attribs_X_train = ['PULocationID', 'DOLocationID']

numerical_X_test = X_test.drop(['PULocationID', 'DOLocationID'], axis=1)
cat_attribs_X_test = ['PULocationID', 'DOLocationID']

scale_transform = ColumnTransformer([
    ('scaler', StandardScaler(), num_attribs_X_train),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs_X_train)
])

X_train_prepared = scale_transform.fit_transform(X_train)
X_test_prepared = scale_transform.transform(X_test)
'''Linear regression'''
# print("--- Calculating Linear Regression ---\n")
# lin_reg = LinearRegression()

# print("\n\tCross validation with RMSE for Linear Regression:")
# lin_reg_scores_nMSE = cross_val_score(lin_reg, X_train_prepared, y_train, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
# lin_reg_rmse_scores = np.sqrt(-lin_reg_scores_nMSE)
# print("\nScores RMSE:\n", lin_reg_rmse_scores)
# print("\nMean score RMSE:\n", lin_reg_rmse_scores.mean())
# print("\nStandard deviation:\n", lin_reg_rmse_scores.std())
#
# print("\n\tCross validation with MAE for Linear Regression:")
# lin_reg_scores_MAE = cross_val_score(lin_reg, X_train_prepared, y_train, scoring="neg_mean_absolute_error", cv=10, n_jobs=-1)
# print("\nScores MAE:\n", lin_reg_scores_MAE)
# print("\nMean score MAE:\n", lin_reg_scores_MAE.mean())
示例#30
housing_prepared

housing_prepared.shape

# Linear Regression model

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# let's try the full preprocessing pipeline on a few training instances
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

# Compare predictions with actual data
print("Labels:", list(some_labels))

# Measure model's RMSE on whole training set

from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
示例#31
def examples():
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    from sklearn.decomposition import PCA
    estimators = [('reduce_dim', PCA()), ('clf', SVC())]
    pipe = Pipeline(estimators)
    print(pipe)
    print(pipe.steps[0])
    print(pipe.named_steps['reduce_dim'])

    pipe.set_params(clf__C=10)
    print(pipe.named_steps['clf'])

    ###################################################
    # Grid search over the pipeline's parameters (important)
    from sklearn.model_selection import GridSearchCV
    param_grid = dict(reduce_dim__n_components=[2, 5, 10],
                      clf__C=[0.1, 10, 100])
    grid_search = GridSearchCV(pipe, param_grid=param_grid)
    print(grid_search)

    ###################################################
    # Grid search over the pipeline's parameters, including swapping entire steps (important)
    from sklearn.linear_model import LogisticRegression

    param_grid = dict(reduce_dim=[None, PCA(5), PCA(10)],
                      clf=[SVC(), LogisticRegression()],
                      clf__C=[0.1, 10, 100])  # several candidates can be given as a list
    grid_search = GridSearchCV(pipe, param_grid=param_grid)
    print(grid_search)

    ###################################################
    from sklearn.pipeline import make_pipeline
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.preprocessing import Binarizer
    pipe = make_pipeline(Binarizer(), MultinomialNB())
    print(pipe)

    ###################################################
    # Use the memory argument to cache transformers and avoid recomputation
    from tempfile import mkdtemp
    from shutil import rmtree
    from sklearn.decomposition import PCA
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline
    estimators = [('reduce_dim', PCA()), ('clf', SVC())]
    cachedir = mkdtemp()
    pipe = Pipeline(estimators, memory=cachedir)
    print(pipe)

    # Clear the cache directory when you don't need it anymore
    rmtree(cachedir)

    #####################################################
    #  Transforming target in regression
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.preprocessing import QuantileTransformer
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    boston = load_boston()  # note: load_boston was removed in scikit-learn 1.2
    X = boston.data
    y = boston.target
    transformer = QuantileTransformer(output_distribution='normal')
    regressor = LinearRegression()
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    regr.fit(X_train, y_train)

    print('R2 score: {0:.2f}'.format(regr.score(X_test, y_test)))

    raw_target_regr = LinearRegression().fit(X_train, y_train)
    print('R2 score: {0:.2f}'.format(raw_target_regr.score(X_test, y_test)))

    ##########################################################
    # Per-column preprocessing with ColumnTransformer
    import pandas as pd
    X = pd.DataFrame({
        'city': ['London', 'London', 'Paris', 'Sallisaw'],
        'title': [
            "His Last Bow", "How Watson Learned the Trick", "A Moveable Feast",
            "The Grapes of Wrath"
        ],
        'expert_rating': [5, 3, 4, 5],
        'user_rating': [4, 5, 4, 3]
    })

    from sklearn.compose import ColumnTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    column_trans = ColumnTransformer(
        [('city_category', CountVectorizer(analyzer=lambda x: [x]), 'city'),
         ('title_bow', CountVectorizer(), 'title')],
        remainder='drop')

    print(column_trans.fit(X))
    # on scikit-learn >= 1.0 this is get_feature_names_out()
    print(column_trans.get_feature_names())
    print(column_trans.transform(X).toarray())
示例#32
        )
    ],
    remainder="passthrough",
)
# %%
train_x = ct.fit_transform(train_x)
clf = DecisionTreeClassifier()

clf.fit(train_x, train_y)


# %%
evaluate(clf, train_x, train_y)

# %%
test_x = test[selected_columns]
test_x["Cabin"] = test_x["Cabin"].fillna("NA")
test_x["Embarked"] = test_x["Embarked"].fillna("NA")
test_x["Age"] = test_x["Age"].fillna(ave_age)
test_x["Fare"] = test_x["Fare"].fillna(train["Fare"].mean())

# %%
test_x = ct.transform(test_x)
pred = clf.predict(test_x)

# %%
test_y = truth["Survived"]
evaluate(clf, test_x, test_y)

# %%
###############################################################################
# We will perform a 10-fold cross-validation and train the neural-network with
# the two different strategies previously presented.

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)

cv_results_imbalanced = []
cv_time_imbalanced = []
cv_results_balanced = []
cv_time_balanced = []
for train_idx, valid_idx in skf.split(X_train, y_train):
    X_local_train = preprocessor.fit_transform(X_train.iloc[train_idx])
    y_local_train = y_train.iloc[train_idx].values.ravel()
    X_local_test = preprocessor.transform(X_train.iloc[valid_idx])
    y_local_test = y_train.iloc[valid_idx].values.ravel()

    elapsed_time, roc_auc = fit_predict_imbalanced_model(
        X_local_train, y_local_train, X_local_test, y_local_test)
    cv_time_imbalanced.append(elapsed_time)
    cv_results_imbalanced.append(roc_auc)

    elapsed_time, roc_auc = fit_predict_balanced_model(
        X_local_train, y_local_train, X_local_test, y_local_test)
    cv_time_balanced.append(elapsed_time)
    cv_results_balanced.append(roc_auc)

###############################################################################
# Plot of the results and computation time
###############################################################################
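
# The original snippet ends at the header above; what follows is a minimal
# sketch (not from the source) of how the collected scores and timings could
# be visualised with matplotlib box plots.
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.boxplot([cv_results_imbalanced, cv_results_balanced],
            labels=['imbalanced', 'balanced'])
ax1.set_ylabel('ROC-AUC')
ax2.boxplot([cv_time_imbalanced, cv_time_balanced],
            labels=['imbalanced', 'balanced'])
ax2.set_ylabel('time [s]')
plt.tight_layout()
plt.show()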