Пример #1
0
# Label each video: class 1 if it trended within 3 days, else 0.
# (np.where with int 1 / float 0. upcasts the result to float.)
data["class"] = np.where(data["trending_time"]<=3., 1, 0.)
#Data
# Features come from the normalised frame, labels from the raw frame.
X = np.array(data_norm[["likes", "dislikes", "views", "comment_count", "trending_time"]])
y = np.array(data["class"]).squeeze()
attributeNames = ["likes", "dislikes", "views", "comment_count", "trending_time"]
N, M = X.shape  # N observations, M features


# Use K-fold cross-validation to estimate the number of neighbours k
# for a k-nearest-neighbour classifier.  (translated from Danish)

#k-fold cross validation with classifier 

K = 10

CV = model_selection.KFold(K, shuffle=True)
L=40 # Maximum number of neighbors


# errors[i, l]: presumably the error for observation i with l+1 neighbours
# — confirm against the (truncated) loop body below.
errors = np.zeros((N,L))
K = 10
internal_cross_validation = 10

k1=0
# BUG(review): CV1 is never defined in this snippet — the splitter built
# above is named CV; this line raises NameError as written.
for train_index, test_index in CV1.split(X,y):
    
    # extract training and test set for current CV fold
    # (snippet is truncated here — the fold body continues beyond this excerpt)
    Xo_train = X[train_index,:]
    yo_train = y[train_index]
    Xo_test = X[test_index,:]
    yo_test = y[test_index]
Пример #2
0
import sklearn.linear_model as lm
from sklearn import model_selection
from toolbox_02450 import feature_selector_lr, bmplot
import numpy as np

# Load data from matlab file
# NOTE(review): loadmat is not imported in this excerpt — presumably
# scipy.io.loadmat from an earlier part of the file; confirm.
mat_data = loadmat('../Data/body.mat')
X = mat_data['X']
y = mat_data['y'].squeeze()
attributeNames = [name[0] for name in mat_data['attributeNames'][0]]
N, M = X.shape  # N observations, M attributes

## Crossvalidation
# Create crossvalidation partition for evaluation
K = 5
CV = model_selection.KFold(n_splits=K, shuffle=True)

# Initialize variables
# One column per fold; the *_fs arrays presumably hold errors after
# feature selection (cf. the feature_selector_lr import) — confirm.
Features = np.zeros((M, K))
Error_train = np.empty((K, 1))
Error_test = np.empty((K, 1))
Error_train_fs = np.empty((K, 1))
Error_test_fs = np.empty((K, 1))
Error_train_nofeatures = np.empty((K, 1))
Error_test_nofeatures = np.empty((K, 1))

k = 0
for train_index, test_index in CV.split(X):

    # extract training and test set for current CV fold
    # (snippet is truncated here — the fold body continues beyond this excerpt)
    X_train = X[train_index, :]
Пример #3
0
# In[ ]:

# First two columns as features; `array` is defined in an earlier cell.
X = array[:, 0:2]

# In[ ]:

# Column index 3 as the target.
Y = array[:, 3]

# In[ ]:

seed = 1234567890

# In[ ]:

# NOTE(review): random_state with the default shuffle=False was ignored
# by old scikit-learn and raises ValueError in modern versions; either
# pass shuffle=True or drop random_state.
kfold = model_selection.KFold(n_splits=10, random_state=seed)

# In[ ]:

model = LogisticRegression()

# In[ ]:

scoring = 'accuracy'

# In[ ]:

# 10-fold cross-validated score (the call is truncated in this excerpt).
results = model_selection.cross_val_score(model,
                                          X,
                                          Y,
                                          cv=kfold,
Пример #4
0
    def __cross_validation(self, classifier, X, y, k, stratify=True):
        """Validate *classifier* with k-fold cross-validation.

        Also used for leave-one-out validation: when ``k`` equals the
        number of samples, an unshuffled KFold degenerates to
        leave-one-out and the single-sample fold results are
        concatenated before computing metrics.

        :param classifier: classifier for validation
        :type classifier: sklearn classifier object
        :param X: feature values of training data (including training and validation sets)
        :type X: pandas dataframe
        :param y: labels of training data
        :type y: pandas series
        :param k: number of folds
        :type k: int
        :param stratify: draw samples according to class proportions or not
        :type stratify: bool

        :returns: performance metrics on training and validation data
        """
        if k == X.shape[0]:  # leave-one-out
            kf = model_selection.KFold(n_splits=k)
        else:
            if stratify:
                kf = model_selection.StratifiedKFold(n_splits=k,
                                                     shuffle=True,
                                                     random_state=0)
            else:
                kf = model_selection.KFold(n_splits=k,
                                           shuffle=True,
                                           random_state=0)

        # training data and predictions for each fold
        y_train_list = []
        y_train_pred_list = []
        y_train_prob_list = []
        y_val_list = []
        y_val_pred_list = []
        y_val_prob_list = []

        for train_idx, val_idx in kf.split(X, y):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            y_train_list.append(y_train)
            y_val_list.append(y_val)

            # catch convergence warning: promote ConvergenceWarning to an
            # error so a non-converging fit aborts the run instead of
            # passing silently.
            with warnings.catch_warnings():
                warnings.filterwarnings('error',
                                        category=exceptions.ConvergenceWarning)
                try:
                    classifier = classifier.fit(X_train, y_train)
                except exceptions.ConvergenceWarning:
                    # Roll back before propagating — Model.counter is
                    # presumably a class-level tally of attempted models;
                    # confirm against the Model class.
                    Model.counter -= 1
                    raise

            y_train_pred_list.append(classifier.predict(X_train))
            y_val_pred_list.append(classifier.predict(X_val))
            y_train_prob_list.append(classifier.predict_proba(X_train))
            y_val_prob_list.append(classifier.predict_proba(X_val))

        if k == X.shape[0]:  # leave-one-out
            # Each fold held exactly one sample: stack the per-fold
            # results back into flat arrays for the 'loo' metrics.
            y_val = np.hstack(y_val_list)
            y_val_pred = np.hstack(y_val_pred_list)
            y_val_prob = np.vstack(y_val_prob_list)

            return ModelMetrics(classifier, y_train_list, y_train_pred_list, y_train_prob_list, 'cv'), \
             ModelMetrics(classifier, y_val, y_val_pred, y_val_prob, 'loo')
        else:
            return ModelMetrics(classifier, y_train_list, y_train_pred_list, y_train_prob_list, 'cv'), \
             ModelMetrics(classifier, y_val_list, y_val_pred_list, y_val_prob_list, 'cv')
Пример #5
0
def compare_predictions(df,
                        y_var_name,
                        percent_data=None,
                        category_limit=11,
                        knots=3,
                        alphas=np.logspace(start=-2, stop=10, num=50),
                        corr_matrix=True,
                        scatter_matrix=True,
                        bootstrap_coefs=True,
                        feature_importances=True,
                        partial_dep=True,
                        actual_vs_predicted=True,
                        residuals=True,
                        univariates=True,
                        compare_models=True,
                        ROC=True,
                        bootstraps=10):
    """Clean *df*, fit a suite of models against *y_var_name* and plot
    diagnostics for each.

    The dataframe is cleaned (and optionally subsampled), categoricals
    with more than *category_limit* categories are dropped, features are
    spline-transformed via use_spline, and every model produced by
    make_models is 10-fold cross-validated, refitted on the full data
    and visualised (coefficients, bootstrap coefficients, feature
    importances, partial dependences, predicted-vs-actual/residual
    plots, and optionally a ROC curve and model-comparison plot).

    INPUT:
        df: pandas DataFrame holding features and the target column
        y_var_name: string, name of the target column
        percent_data: optional fraction of rows to keep
        category_limit: int, max categories a categorical may have
        knots: int, number of knots (divisions) between splines
        alphas: regularisation strengths tried for RR/LASSO
        corr_matrix .. ROC: booleans toggling individual diagnostics
        bootstraps: int, number of bootstrap refits
    OUTPUT:
        (names, results, fit_models, pipeline, df_X, y_hats, errors)
    """
    starttotal = time()
    df, sample_limit = clean_dataframe(df, y_var_name, percent_data)

    # REMEMBER OLD DATAFRAME

    df_unpiped, df_X_unpiped = df.copy(), df.copy().drop(y_var_name, axis=1)
    (unpiped_continuous_features,
     unpiped_category_features) = sort_features(df_X_unpiped)
    columns_unpiped = df_X_unpiped.columns

    # REMOVE CATEGORICAL VARIABLES THAT HAVE TOO MANY CATEGORIES TO BE USEFUL
    df = drop_category_exeeding_limit(df, y_var_name, category_limit)

    # SHOW CORRELATION MATRIX
    if corr_matrix:
        if len(unpiped_continuous_features) > 0:
            timeit(plt.matshow, df.sample(sample_limit).corr())

    # MAKE SCATTER MATRIX
    if scatter_matrix:
        if len(unpiped_continuous_features) > 0:
            timeit(plot_scatter_matrix, df, y_var_name, colors=True)
            plt.show()

    # TRANSFORM DATAFRAME
    print('DF COLUMNS: \n' + str(list(df.columns)) + '\n')
    df, df_X, X, y, pipeline = use_spline(df, y_var_name)
    print('DF COLUMNS AFTER TRANSFORM: \n' + str(list(df.columns)) + '\n')

    # MAKE MODELS
    (names_models, continuous_features, category_features, models, scoring,
     is_continuous, alphas) = make_models(df, df_X, y, y_var_name, univariates,
                                          alphas)

    # evaluate each model in turn
    fit_models, results, names, y_hats, errors = [], [], [], [], []

    for name, model in tqdm.tqdm(names_models):
        # if not linear: change df_X to df_X unpiped
        # FIX: random_state was passed with the default shuffle=False —
        # ignored by old scikit-learn and rejected by modern versions;
        # dropped, so the contiguous folds are unchanged.
        kfold = model_selection.KFold(n_splits=10)
        if name == 'RR' or name == 'LASSO':
            alpha, cv_results = timeit(plot_choose_alpha, df, model,
                                       y_var_name, alphas, kfold, scoring)
            model = model(alpha)
        else:
            cv_results = timeit(cross_val_score,
                                model,
                                X,
                                y,
                                cv=kfold,
                                scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: mean=%f std=%f" % (name, cv_results.mean(),
                                      cv_results.std())
        print(msg)

        # OTHER CROSS VALIDATE METHOD:

        # FIT MODEL WITH ALL DATA
        model.fit(X, y)
        fit_models.append(model)

        # PLOT PREDICTED VS ACTUALS
        if is_continuous:
            timeit(plot_predicted_vs_actuals, df, model, y_var_name,
                   sample_limit)
            plt.show()

        # MAKE BOOTSTRAPS (also needed for the partial-dependence plots)
        if bootstrap_coefs or partial_dep:
            bootstrap_models = bootstrap_train_premade(model,
                                                       X,
                                                       y,
                                                       bootstraps=bootstraps,
                                                       fit_intercept=False)

        # PLOT COEFFICIANTS
        if hasattr(model, "coef_"):
            coefs = model.coef_
            columns = list(df.drop(y_var_name, axis=1).columns)
            # unwrap nested coefficient arrays (e.g. shape (1, n))
            while (type(coefs[0]) is list) or (type(coefs[0]) is np.ndarray):
                coefs = list(coefs[0])
            timeit(plot_coefs, coefs=coefs, columns=columns, graph_name=name)
            plt.show()

            # PLOT BOOTSTRAP COEFFICIANTS
            if is_continuous:
                if bootstrap_coefs:
                    # PLOT BOOTSTRAP COEFS
                    fig, axs = timeit(plot_bootstrap_coefs,
                                      bootstrap_models,
                                      df_X.columns,
                                      n_col=4)
                    fig.tight_layout()
                    plt.show()

        # PLOT FEATURE IMPORTANCES
        if feature_importances:
            if 'feature_importances_' in dir(model):
                timeit(plot_feature_importances, model, df_X)
                plt.show()

        # PLOT PARTIAL DEPENDENCIES
        if partial_dep:
            timeit(plot_partial_dependences,
                   model,
                   X=df_X_unpiped,
                   var_names=unpiped_continuous_features,
                   y=y,
                   bootstrap_models=bootstrap_models,
                   pipeline=pipeline,
                   n_points=250)
            plt.tight_layout()
            plt.show()

        # PLOT PREDICTED VS ACTUALS
        plot_continuous_error_graphs(df,
                                     y,
                                     y_var_name,
                                     model,
                                     is_continuous,
                                     sample_limit,
                                     predicteds_vs_actuals=True,
                                     residuals=True)
        df_X = df.drop(y_var_name, axis=1)

        # GET ERROR
        y_hat, error = get_error(name, model, df_X, y, is_continuous)
        y_hats.append(y_hat)
        errors.append(error)

    # --COMPARE MODELS--
    if compare_models:
        choose_box_and_violin_plots(names, scoring, compare_models, results,
                                    is_continuous)
    # ROC CURVE
    if ROC:
        if not is_continuous:
            timeit(plot_rocs, models, df_X, y)
            plt.show()
    print(f'MAKE SUBSAMPLE TIME: {time() - starttotal}')
    return names, results, fit_models, pipeline, df_X, y_hats, errors
Пример #6
0
def models(df):
    """Benchmark six classifiers on the features selected from *df*.

    Each classifier is scored with 10-fold cross-validation on an 80%
    training split, then refitted and evaluated on the held-out 20%
    validation split (accuracy, confusion matrix, classification report).

    :param df: pandas DataFrame; getFeaturesForModels selects the columns
    """
    df = getFeaturesForModels(df)
    array = df.values
    # NOTE(review): X takes columns 0..21 but Y is column 23, so column
    # 22 is unused — confirm against the layout from getFeaturesForModels.
    X = array[:, 0:22]
    Y = array[:, 23]
    validation_size = 0.20
    seed = 7
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
        X, Y, test_size=validation_size, random_state=seed)
    scoring = 'accuracy'
    models = []
    models.append(('LR', LogisticRegression()))
    models.append(('LDA', LinearDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier()))
    models.append(('CART', DecisionTreeClassifier()))
    models.append(('NB', GaussianNB()))
    models.append(('SVM', SVC()))

    # # evaluate each model in turn
    results = []
    names = []
    for name, model in models:
        # FIX: random_state with the default shuffle=False was ignored by
        # old scikit-learn and is rejected by modern versions; dropped —
        # the contiguous folds are unchanged.
        kfold = model_selection.KFold(n_splits=10)
        cv_results = model_selection.cross_val_score(model,
                                                     X_train,
                                                     Y_train,
                                                     cv=kfold,
                                                     scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

    # Held-out evaluation of each classifier (prints normalized to the
    # function form, which behaves identically for a single argument).
    print("KNeighborsClassifier")
    knn = KNeighborsClassifier()
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))

    print("CART")
    CART = DecisionTreeClassifier()
    CART.fit(X_train, Y_train)
    predictions = CART.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))

    print("LR")
    LR = LogisticRegression()
    LR.fit(X_train, Y_train)
    predictions = LR.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))

    print("LDA")
    LDA = LinearDiscriminantAnalysis()
    LDA.fit(X_train, Y_train)
    predictions = LDA.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))

    print("NB")
    NB = GaussianNB()
    NB.fit(X_train, Y_train)
    predictions = NB.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))

    print("SVM")
    # FIX: the original instantiated GaussianNB() here by copy-paste
    # error, so the "SVM" section never evaluated an SVM.
    SVM = SVC()
    SVM.fit(X_train, Y_train)
    predictions = SVM.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    cm = confusion_matrix(Y_validation, predictions)
    print(cm)
    # precision = TP / (TP + FP), read off the confusion matrix
    # (computed once instead of three separate confusion_matrix calls)
    precision = float(cm[1][1]) / float(cm[1][1] + cm[0][1])
    print(precision)
    print(classification_report(Y_validation, predictions))
def analyse(model_name):
    """Cross-validate the named classifier on the HEP demo dataset.

    Downloads the demo CSV, builds the classifier selected by
    *model_name*, scores it with 10-fold cross-validation (accuracy)
    and returns ``(model_name, scores)``.  On any failure — including an
    unknown model name — returns ``(model_name, [])``, preserving the
    original best-effort contract.
    """
    # Zero-argument factories so only the requested model is constructed.
    factories = {
        "LogReg": lambda: LogisticRegression(),
        "SVM": lambda: SVC(),
        "DecTree": lambda: DecisionTreeClassifier(),
        "KNN": lambda: KNeighborsClassifier(),
        "LinDisc": lambda: LinearDiscriminantAnalysis(),
        "GaussianNB": lambda: GaussianNB(),
        "MLP": lambda: MLPClassifier(),
        "GaussianPC": lambda: GaussianProcessClassifier(),
        "RandomForest": lambda: RandomForestClassifier(),
        "AdaBoost": lambda: AdaBoostClassifier(),
        "QuadraticDisc": lambda: QuadraticDiscriminantAnalysis(),
        "SVClinear": lambda: SVC(kernel="linear", C=0.025),
        "SVCgamma": lambda: SVC(gamma=2, C=1),
        "KNN3": lambda: KNeighborsClassifier(3),
        "GaussianRBF": lambda: GaussianProcessClassifier(1.0 * RBF(1.0)),
        "DecTreeDepth": lambda: DecisionTreeClassifier(max_depth=5),
        "RandomForestDepth": lambda: RandomForestClassifier(
            max_depth=5, n_estimators=10, max_features=1),
        "MLPalpha": lambda: MLPClassifier(alpha=1),
    }
    try:
        ### read in the data (HEP)
        data = pd.read_csv(
            "https://s3-us-west-2.amazonaws.com/iqoqo.temp/demo/all_train_10000.csv"
        )
        data.head()
        data_to_use = data
        data_to_use.dropna(inplace=True)
        data_to_use.head()
        values = data_to_use.values
        Y = values[:, 0]
        X = values[:, 1:28]

        print("--- starting " + model_name + " analysis ---")
        if model_name not in factories:
            # FIX: the original printed and called quit(), but the
            # resulting SystemExit was swallowed by the bare except and
            # the function returned (model_name, []) anyway — that
            # effective behaviour is made explicit here.
            print("Model name not found: " + model_name)
            return model_name, []
        model = factories[model_name]()

        # FIX: random_state with the default shuffle=False was ignored by
        # old scikit-learn and is rejected by modern versions; dropped —
        # the contiguous folds are unchanged.
        k_fold_validation = model_selection.KFold(n_splits=10)
        results = model_selection.cross_val_score(model,
                                                  X,
                                                  Y,
                                                  cv=k_fold_validation,
                                                  scoring="accuracy")
        output_message = "%s| Mean=%f STD=%f" % (
            model_name,
            results.mean(),
            results.std(),
        )
        print(output_message)
        print("--- done " + model_name + " analysis ---")
        return model_name, results
    except Exception:
        # FIX: narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit; the best-effort contract of
        # returning an empty result on failure is kept.
        return model_name, []
Пример #8
0
# Encode Embarked == "Q" as 2 in the test set (C/S handled earlier).
data_test.loc[data_test["Embarked"] == "Q", "Embarked"] = 2
test_features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Create the Survived column for the test set,  (translated)
data_test["Survived"] = -1
test_predictors = data_test[test_features]
data_test["Survived"] = logRegAlg.predict(test_predictors)
print(data_test.head(10))

# Use the random forest algorithm  (translated)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# 10 decision trees; stopping conditions: min 2 samples to split a node,
# min 1 sample per leaf  (translated)
# NOTE(review): random_state together with shuffle=False is ignored by
# old scikit-learn and raises ValueError in modern versions.
alg = RandomForestClassifier(random_state=1,
                             n_estimators=10,
                             min_samples_split=2,
                             min_samples_leaf=1)
kf = model_selection.KFold(n_splits=3, shuffle=False, random_state=1)
scores = model_selection.cross_val_score(alg,
                                         data_train[predictors],
                                         data_train["Survived"],
                                         cv=kf)
print(scores)
print(scores.mean())

# Increase to 30 decision trees and use 10-fold cross-validation  (translated)
alg = RandomForestClassifier(random_state=1,
                             n_estimators=30,
                             min_samples_split=2,
                             min_samples_leaf=1)
kf = model_selection.KFold(n_splits=10, shuffle=False, random_state=1)
# (this cross_val_score call is truncated in the excerpt — the snippet
# is cut off mid-argument-list)
scores = model_selection.cross_val_score(alg,
                                         data_train[predictors],
learning_goal = 1  # stop criterion 1 (train mse to be reached)
max_epochs = 300  # stop criterion 2 (max epochs in training)
show_error_freq = 50  # frequency of training status updates

# Getting the min max range for every attribute and the y vector
# minMaxRange = [[min(X[:, 0]), max (X[:, 0])], [min(X[:, 1]), max(X[:, 1])], [min(X[:, 2]), max(X[:,2])],
#                [min(X[:, 3]), max (X[:, 3])], [min(X[:, 4]), max(X[:, 4])], [min(X[:, 5]), max(X[:,5])],
#                [min(X[:, 6]), max(X[:, 6])], [min(X[:, 7]), max(X[:, 7])]]
# NOTE(review): [[0, 1]] * M creates M references to the SAME inner
# list — fine if only read, a bug if any entry is mutated later.
minMaxRange = [[0, 1]] * M

# K-fold crossvalidation
# Outer loop
K = 5
# Inner Loop
J = 3
CV_1 = model_selection.KFold(K, shuffle=False)
CV_2 = model_selection.KFold(J, shuffle=False)

# Variable for classification error
# (np.zeros(...) * np.nan yields NaN-filled arrays: "not yet computed")
errors = np.zeros((J, K)) * np.nan
gen_errors = np.zeros(K) * np.nan
error_hist = np.zeros((max_epochs, K)) * np.nan
bestnet = list()
best_bestnet = list()
k = 0
bestnet_hidden_units = np.zeros((J, K)) * np.nan
y_best_est = []
best_performing_anns = []
best_hidden_neurons_outer = []
mean_errors = np.zeros(K) * np.nan
# Outer CV loop (X and y are defined earlier in the file; the loop body
# is truncated in this excerpt)
for train_index, test_index in CV_1.split(X, y):
Пример #10
0
    * 保存交叉验证在训练集上的结果,并保存下来计算gini

'''

start_all0 = datetime.datetime.now()

# Load the pre-processed train/test data (helpers defined elsewhere).
test_x,testID=deal_test()
train_x,train_y=deal_train()
# FIX: DataFrame.fillna returns a new frame — the original discarded the
# result, leaving the NaNs in place; assign it back.
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)
print("*******DATA*******:{:.2f} hours".format((datetime.datetime.now()-start_all0).seconds/3600))
start_all = datetime.datetime.now()

#-----model------
n_split=5
# FIX: random_state together with shuffle=False was ignored by old
# scikit-learn and is rejected by modern versions; dropped — the
# contiguous folds are unchanged.
cv_split = model_selection.KFold(n_splits=n_split, shuffle=False)
gbm=lgb.LGBMRegressor( objective='regression',num_leaves=6,
                              learning_rate=0.02, n_estimators=500,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.5,
                              feature_fraction_seed=None, bagging_seed=None,
                              min_data_in_leaf =1, min_sum_hessian_in_leaf = 11,min_data=1 )


# Per-fold predictions on the test set (one column per fold) and
# out-of-fold predictions on the training set.  (translated)
y_pred_test_cv=np.ones((test_x.shape[0],n_split))
y_pred_train_cv=np.ones((train_x.shape[0],))

# Fit one model per fold on the training split.  (the rest of the fold
# body is truncated in this excerpt)
for i, (train_index,test_index) in enumerate(cv_split.split(train_x,train_y)):
    
    gbm.fit(train_x.iloc[train_index],train_y[train_index])
Пример #11
0
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection as ms

if __name__ == '__main__':
    # Titanic-style survival classification on a pre-cleaned CSV.
    df = pd.read_csv('cleaned.csv')
    data = df.copy()
    # feature selection
    cols = ['Age', 'SibSp', 'Parch', 'male', 'female', 'class1',
            'class2', 'class3', 'embarkedS', 'embarkedC', 'embarkedQ']
    X = data[cols]
    y = data['Survived']
    # train-test split (70/30, fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = ms.train_test_split(X,
                                                           y,
                                                           test_size=0.3,
                                                           random_state=0)
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.score(X_test, y_test))

    #one step forward??(similar to ARIMA??)

    # cross validation
    # FIX: random_state with the default shuffle=False was ignored by old
    # scikit-learn and is rejected by modern versions; dropped — the
    # contiguous folds are unchanged.
    kfold = ms.KFold(n_splits=10)
    model_cv = DecisionTreeClassifier(random_state=0)
    results_cv = ms.cross_val_score(model_cv, X_train, y_train, cv=kfold,
                                    scoring='accuracy')
    print(results_cv.mean())

    #how to improve accuracy?? (parameters of the classifier??)
Пример #12
0

#Logistic Regression
# Baseline single-model fit on the train/test split defined earlier.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


# In[ ]:


# Bagged Decision Trees for Classification
# FIX: random_state with the default shuffle=False was ignored by old
# scikit-learn and is rejected by modern versions; dropped — the
# contiguous folds are unchanged.
# NOTE(review): base_estimator was renamed to estimator in
# scikit-learn 1.2 — update once the pinned version allows.
kfold = model_selection.KFold(n_splits=10)
model_1 = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=10)
results_1 = model_selection.cross_val_score(model_1, x, y, cv=kfold)
print(results_1.mean())


# In[32]:


# Random Forest Classification
kfold_rf = model_selection.KFold(n_splits=10)
model_rf = RandomForestClassifier(n_estimators=100, max_features=5)
results_rf = model_selection.cross_val_score(model_rf, x, y, cv=kfold_rf)
print(results_rf.mean())

Пример #13
0
# split the model into train and test set
# (75/25 split of the iris features and targets)
iris_data_train, iris_data_test, iris_targets_train , iris_targets_test = \
    model_selection.train_test_split(iris_data, iris_targets, test_size=0.25)

# K-nearest neighbor classifier
knn_iris = neighbors.KNeighborsClassifier(n_neighbors=13)
knn_iris.fit(iris_data_train, iris_targets_train)

iris_target_pred_knn = knn_iris.predict(iris_data_test)
# Another way to get accuracy explicitly is: np.mean(y == y_pred)
print("KNN Accuracy (Iris Dataset):",
      metrics.accuracy_score(iris_targets_test, iris_target_pred_knn))

# K-fold cross validation (4 shuffled folds over the full dataset)
iris_kfold = model_selection.KFold(n_splits=4, shuffle=True)
cv_score = model_selection.cross_val_score(knn_iris,
                                           X=iris_data,
                                           y=iris_targets,
                                           cv=iris_kfold)
print("Cross-validation score is %s" % cv_score,
      "Mean CV is %s" % np.mean(cv_score))

# Compare the performance of kNN for different k:
# 49 candidate k values (2..50) x 400 repetitions of the experiment
# sketched in the commented-out loop below.
k_accuracy_scores = np.zeros((49, 400))
# for k in range(2, 51):
#     for rep in range(1, 400):
#         iris_data_train, iris_data_test, iris_targets_train, iris_targets_test = \
#             model_selection.train_test_split(iris_data, iris_targets, test_size=0.25)
#         knn_test_iris = neighbors.KNeighborsClassifier(n_neighbors=k)
#         knn_test_iris.fit(iris_data_train, iris_targets_train)
Пример #14
0
def main():
    """End-to-end exploration of the mushroom dataset.

    Loads the CSV, label-encodes every column, benchmarks six
    classifiers with 10-fold cross-validation, box-plots the comparison,
    runs three feature-selection methods (chi2 SelectKBest, RFE,
    ExtraTrees importances) and prints the conclusions.
    """
    # load dataset
    url_dataset = "script/mushrooms.csv"
    dataset = pandas.read_csv(url_dataset)
    # check the attibutes
    get_unique_attribute(dataset)
    #remove veil-type attribute
    del dataset['veil-type']
    #print(dataset.columns.values)
    # encode the attributes with LabelEncoder
    to_be_encoded_cols = dataset.columns.values
    label_encode(dataset, to_be_encoded_cols)
    # check the attibutes after encoded
    #get_unique_attribute(dataset)
    # split-out validation dataset
    array = dataset.values
    # NOTE(review): Y = array[:, 0] is the first column, which is also
    # included in X (columns 0..21) — confirm the label column index.
    X = array[:, 0:22]
    Y = array[:, 0]
    validation_size = 0.20
    seed = 7
    scoring = "accuracy"
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
        X, Y, test_size=validation_size, random_state=seed)
    # Spot Check Algorithms
    models = []
    models.append(('LR', LogisticRegression()))
    models.append(('RF', RandomForestClassifier()))
    models.append(('KNN', KNeighborsClassifier()))
    models.append(('TREE', DecisionTreeClassifier()))
    models.append(('NB', GaussianNB()))
    models.append(('SVM', SVC()))
    # evaluate each model in turn
    results = []
    names = []
    for name, model in models:
        # FIX: random_state with the default shuffle=False was ignored by
        # old scikit-learn and is rejected by modern versions; dropped.
        kfold = model_selection.KFold(n_splits=10)
        cv_results = model_selection.cross_val_score(model,
                                                     X_train,
                                                     Y_train,
                                                     cv=kfold,
                                                     scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
    # boxplot algorithm comparison
    fig = plt.figure()
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()
    # feature extraction
    test = SelectKBest(score_func=chi2, k=4)
    fit = test.fit(X, Y)
    # summarize scores
    np.set_printoptions(precision=3)
    print(fit.scores_)
    # feature extraction
    model = LogisticRegression()
    # keyword form works across scikit-learn versions (the positional
    # second argument was deprecated in 0.24)
    rfe = RFE(model, n_features_to_select=3)
    fit = rfe.fit(X, Y)
    print("Num Features: {}".format(fit.n_features_))
    print("Selected Features: {}".format(fit.support_))
    print("Feature Ranking: {}".format(fit.ranking_))
    # feature extraction
    model = ExtraTreesClassifier()
    model.fit(X, Y)
    print(model.feature_importances_)
    print("The Conclusion")
    print(
        "Best Machine Learning Type for Mushroom Dataset is Binary Classification Model"
    )
    print(
        "because basicaly, the model is only divided to be two classes that are edible(e) and poisonous(p)"
    )
    print(
        "The Best Algorithm for Mushroom Dataset is DecisionTree, because DecisionTree is the most stable than others"
    )
    print(
        "If we change the seed,numbers of dataset itself whether increase or reduce,we can see that the accuracy of decision tree is highest and stable"
    )
    print(
        "As We can see at the result above(there are three feature selections/extractions), and we can pull the decision that Odor is the indicatived feature"
    )
    print(
        "The Most Importance or Indicative Feature/Attribute is Odor and it has big influence to predict whether the mushroom is edible/poisonous"
    )
    print(
        "Odor with value except Almond/None/Anise tend to be most indicatived attribute of poisonous mushroom"
    )
    # FIX: the original ended with a stray
    # `return K.sqrt(K.mean(K.square(y_pred - y_true)))` pasted in from a
    # Keras RMSE metric; K, y_pred and y_true are undefined here, so the
    # line raised NameError.  main() now simply returns None.


i = 0
nbag = 1   # number of bagging rounds
nfold = 5  # folds per round
# Accumulators: presumably out-of-fold predictions on train and averaged
# predictions on test, two output columns each — confirm downstream.
oobval = np.zeros((train_df.shape[0], 2))
oobtest = np.zeros((test_df.shape[0], 2))
valerr = []
val_scores = []

np.random.seed(2018)
for x in np.arange(nbag):
    for seed in [2018]:
        kf = model_selection.KFold(n_splits=nfold,
                                   shuffle=True,
                                   random_state=seed)
        # KFold only needs the length of its argument, so splitting on y
        # yields row indices used to slice every aligned matrix below.
        # (the fold body is truncated in this excerpt)
        for dev_index, val_index in kf.split(y):
            dev_X, val_X = train_df.values[dev_index, :], train_df.values[
                val_index, :]
            dev_y, val_y = y[dev_index], y[val_index]
            param_train_tfidf_dev, param_train_tfidf_val = param_train_tfidf[
                dev_index, :], param_train_tfidf[val_index, :]
            title_train_tfidf_dev, title_train_tfidf_val = title_train_tfidf[
                dev_index, :], title_train_tfidf[val_index, :]
            desc_train_tfidf_dev, desc_train_tfidf_val = desc_train_tfidf[
                dev_index, :], desc_train_tfidf[val_index, :]
            region_dev, region_val = region_train[dev_index, :], region_train[
                val_index, :]
            pcn_dev, pcn_val = pcn_train[dev_index, :], pcn_train[val_index, :]
            cn_dev, cn_val = cn_train[dev_index, :], cn_train[val_index, :]
Пример #16
0
def dataupload():
    """Handle a CSV upload: save it, profile it, benchmark six
    classifiers with 10-fold cross-validation and render the results.

    Expects a POST with a 'csv_data' file part.  The upload is stored
    under static/uploadsDB, summarised with pandas (the last column is
    treated as the target) and the cross-validation summary is persisted
    to the FileContents table.

    NOTE(review): on a non-POST request (or missing file part) the final
    render_template references unassigned locals and raises
    UnboundLocalError — confirm the route is registered POST-only.
    """
    if request.method == 'POST' and 'csv_data' in request.files:
        file = request.files['csv_data']
        filename = secure_filename(file.filename)
        # os.path.join is used so that paths work in every operating system
        # file.save(os.path.join("wherever","you","want",filename))
        file.save(os.path.join('static/uploadsDB', filename))
        fullfile = os.path.join('static/uploadsDB', filename)

        # For Time
        date = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime("%Y-%m-%d %H:%M:%S"))

        # EDA function
        df = pd.read_csv(os.path.join('static/uploadsDB', filename))
        df_size = df.size
        df_shape = df.shape
        df_columns = list(df.columns)
        df_targetname = df[df.columns[-1]].name
        df_featurenames = df_columns[0:
                                     -1]  # select all columns till last column
        df_Xfeatures = df.iloc[:, 0:-1]
        df_Ylabels = df[df.columns[-1]]  # Select the last column as target
        # same as above df_Ylabels = df.iloc[:,-1]

        # Model Building
        X = df_Xfeatures
        Y = df_Ylabels
        # prepare models
        models = []
        models.append(('LR', LogisticRegression()))
        models.append(('LDA', LinearDiscriminantAnalysis()))
        models.append(('KNN', KNeighborsClassifier()))
        models.append(('CART', DecisionTreeClassifier()))
        models.append(('NB', GaussianNB()))
        models.append(('SVM', SVC()))
        # evaluate each model in turn

        results = []
        names = []
        allmodels = []
        scoring = 'accuracy'
        for name, model in models:
            # FIX: random_state with the default shuffle=False was
            # ignored by old scikit-learn and is rejected by modern
            # versions; dropped (the unused seed local was removed too).
            kfold = model_selection.KFold(n_splits=10)
            cv_results = model_selection.cross_val_score(model,
                                                         X,
                                                         Y,
                                                         cv=kfold,
                                                         scoring=scoring)
            results.append(cv_results)
            names.append(name)
            msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
            allmodels.append(msg)

        # Saving Results of Uploaded Files  to Sqlite DB
        # FIX: file.save() consumed the upload stream, so file.read()
        # returned b'' and empty blobs were stored — rewind first.
        file.seek(0)
        newfile = FileContents(name=file.filename,
                               data=file.read(),
                               modeldata=msg)
        db.session.add(newfile)
        db.session.commit()

    return render_template('details.html',
                           filename=filename,
                           date=date,
                           df_size=df_size,
                           df_shape=df_shape,
                           df_columns=df_columns,
                           df_targetname=df_targetname,
                           model_results=allmodels,
                           model_names=names,
                           fullfile=fullfile,
                           dfplot=df)
Пример #17
0
def binaryClassifiers(train_bag, train_class, test_bag, test_class):
    """Fit, evaluate and time a suite of binary classifiers.

    Each classifier is handed to ``predictFitBinary`` together with the
    training/test bags and labels, and its wall-clock time is printed.

    Parameters
    ----------
    train_bag, train_class : training feature matrix and labels
    test_bag, test_class : test feature matrix and labels
    """
    # Local import so the k-NN fix below does not depend on module-level imports.
    from sklearn.neighbors import KNeighborsClassifier

    print("Naive Bayes")
    start = time.time()
    naive = MultinomialNB()
    predictFitBinary(naive, train_bag, train_class, test_bag, test_class)
    end = time.time()
    print("time: ", (end - start))
    # calcROC(naive, test_bag, test_class, 'NB')
    print()

    print("Logistic Regression")
    start = time.time()
    logreg = linear_model.LogisticRegression()
    predictFitBinary(logreg, train_bag, train_class, test_bag, test_class)
    end = time.time()
    print("time: ", (end - start))
    # calcROC(logreg, test_bag, test_class, 'LR' )
    print()

    print("SVM (Linear Kernel)")
    svml = SVC(kernel='linear', probability=True)
    start = time.time()
    predictFitBinary(svml, train_bag, train_class, test_bag, test_class)
    end = time.time()
    print("time: ", (end - start))
    # calcROC(svml, test_bag, test_class, 'SVML')
    print()

    print("SVM (Gaussian Kernel)")
    start = time.time()
    svmg = SVC(kernel='rbf', probability=True)
    predictFitBinary(svmg, train_bag, train_class, test_bag, test_class)
    end = time.time()
    print("time: ", (end - start))
    # calcROC(svmg, test_bag, test_class, 'SVMG')
    print()

    print("Decision Tree")
    start = time.time()
    dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
    predictFitBinary(dt, train_bag, train_class, test_bag, test_class)
    end = time.time()
    print("time: ", (end - start))
    # calcROC(dt, test_bag, test_class, 'DT')

    print("K Nearest Neighbors")
    start = time.time()
    # BUG FIX: this section previously re-created a LogisticRegression and
    # re-evaluated ``logreg`` (copy-paste error) instead of a k-NN model.
    knn = KNeighborsClassifier()
    predictFitBinary(knn, train_bag, train_class, test_bag, test_class)
    end = time.time()
    print("time: ", (end - start))
    # calcROC(knn, test_bag, test_class, 'KNN')
    print()

    # adapted from http://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/
    print("Random Forest Classifier")
    seed = 7
    num_trees = 100
    max_features = 100
    # BUG FIX: KFold with a random_state requires shuffle=True
    # (scikit-learn >= 0.24 raises a ValueError otherwise).
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    rf = RandomForestClassifier(n_estimators=num_trees,
                                max_features=max_features)
    predictFitBinary(rf, train_bag, train_class, test_bag, test_class)
    results = model_selection.cross_val_score(rf,
                                              train_bag,
                                              train_class,
                                              cv=kfold)
    print(results.mean())
Пример #18
0
def main(configuration_path, signal_path, predictions_path, disp_model_path,
         sign_model_path, key, verbose):
    '''
    Train two learners to be able to reconstruct the source position.
    One regressor for disp and one classifier for the sign of delta.

    Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    DISP_MODEL_PATH: Path to save the disp model to.

    SIGN_MODEL_PATH: Path to save the disp model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''

    # The verbose flag controls the root logger level.
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    # The 'disp' section of the config carries both model definitions.
    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.disp

    # Seed numpy globally so the data sampling below is reproducible.
    np.random.seed(config.seed)

    disp_regressor = model_config.disp_regressor
    sign_classifier = model_config.sign_classifier

    # Seed the estimators themselves as well.
    disp_regressor.random_state = config.seed
    sign_classifier.random_state = config.seed

    log.info('Loading data')
    df = read_telescope_data(
        signal_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal)
    log.info('Total number of events: {}'.format(len(df)))

    # Transform the simulated source position from horizontal (az/zd)
    # coordinates into the camera frame, using the telescope pointing.
    source_x, source_y = horizontal_to_camera(
        az=df[model_config.source_az_column],
        zd=df[model_config.source_zd_column],
        az_pointing=df[model_config.pointing_az_column],
        zd_pointing=df[model_config.pointing_zd_column],
    )

    # Regression target (disp) and classification target (sign of delta).
    df['true_disp'], df['true_sign'] = calc_true_disp(
        source_x,
        source_y,
        df[model_config.cog_x_column],
        df[model_config.cog_y_column],
        df[model_config.delta_column],
    )

    # generate features if given in config
    if model_config.feature_generation:
        feature_generation(df, model_config.feature_generation, inplace=True)

    # Keep only the configured training features; drop rows with any NaN.
    df_train = convert_to_float32(df[config.disp.features])
    df_train.dropna(how='any', inplace=True)

    log.info('Events after nan-dropping: {} '.format(len(df_train)))

    # Align the targets with the rows that survived the NaN drop.
    target_disp = df['true_disp'].loc[df_train.index]
    target_sign = df['true_sign'].loc[df_train.index]

    log.info('Starting {} fold cross validation... '.format(
        model_config.n_cross_validations))
    scores_disp = []
    scores_sign = []
    cv_predictions = []

    # Same folds for both models so their per-fold predictions line up.
    kfold = model_selection.KFold(
        n_splits=model_config.n_cross_validations,
        shuffle=True,
        random_state=config.seed,
    )

    for fold, (train, test) in tqdm(enumerate(kfold.split(df_train.values))):

        cv_x_train, cv_x_test = df_train.values[train], df_train.values[test]

        cv_disp_train, cv_disp_test = target_disp.values[
            train], target_disp.values[test]
        cv_sign_train, cv_sign_test = target_sign.values[
            train], target_sign.values[test]

        disp_regressor.fit(cv_x_train, cv_disp_train)
        cv_disp_prediction = disp_regressor.predict(cv_x_test)

        sign_classifier.fit(cv_x_train, cv_sign_train)
        cv_sign_prediction = sign_classifier.predict(cv_x_test)

        # R^2 for the regressor, accuracy for the sign classifier.
        scores_disp.append(metrics.r2_score(cv_disp_test, cv_disp_prediction))
        scores_sign.append(
            metrics.accuracy_score(cv_sign_test, cv_sign_prediction))

        # Keep the per-fold truth/prediction pairs for later inspection.
        cv_predictions.append(
            pd.DataFrame({
                'disp': cv_disp_test,
                'disp_prediction': cv_disp_prediction,
                'sign': cv_sign_test,
                'sign_prediction': cv_sign_prediction,
                'cv_fold': fold
            }))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)

    log.info('writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    scores_disp = np.array(scores_disp)
    scores_sign = np.array(scores_sign)
    log.info('Cross validated R^2 scores for disp: {}'.format(scores_disp))
    log.info('Mean R^2 score from CV: {:0.4f} ± {:0.4f}'.format(
        scores_disp.mean(), scores_disp.std()))

    log.info('Cross validated accuracy for the sign: {}'.format(scores_sign))
    log.info('Mean accuracy from CV: {:0.4f} ± {:0.4f}'.format(
        scores_sign.mean(), scores_sign.std()))

    log.info('Building new model on complete data set...')
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    disp_regressor.random_state = config.seed
    sign_classifier.random_state = config.seed

    # Final fit on all surviving rows.
    disp_regressor.fit(df_train.values, target_disp.values)
    sign_classifier.fit(df_train.values, target_sign.values)

    log.info('Pickling disp model to {} ...'.format(disp_model_path))
    pickle_model(
        disp_regressor,
        feature_names=list(df_train.columns),
        model_path=disp_model_path,
        label_text='disp',
    )
    log.info('Pickling sign model to {} ...'.format(sign_model_path))
    pickle_model(
        sign_classifier,
        feature_names=list(df_train.columns),
        model_path=sign_model_path,
        # NOTE(review): label_text='disp' here looks copy-pasted from the
        # disp model above -- should this be 'sign'? Verify with downstream
        # consumers of the pickled model before changing.
        label_text='disp',
    )
Пример #19
0
# Build the feature matrix and standardise every column.
X = np.array(feature_list)
scaler = StandardScaler().fit(X)
X_train = scaler.transform(X)

# Encode the string labels as integers.
encoder = LabelEncoder()
y_train = encoder.fit_transform(np.array(label_list))

# Linear-kernel SVM evaluated with shuffled, seeded 5-fold cross-validation.
clf = svm.SVC(kernel='linear')
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=1)

# Accuracy on each held-out fold.
scores = model_selection.cross_val_score(clf, X_train, y_train,
                                         cv=kf, scoring='accuracy')
print('Scores: ' + str(scores))
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), 2 * scores.std()))

# Out-of-fold prediction for every sample.
predictions = model_selection.cross_val_predict(clf, X_train, y_train, cv=kf)
Пример #20
0
# The iris dataset
iris = datasets.load_iris()

# Number of cross-validation splits
splits = 5
# Cross-validation score accumulators, one per model
# (NOTE(review): they are not updated in the visible portion of this
# snippet -- the loop body appears truncated here)
score_linier = 0
score_poly = 0
score_rbf = 0
score_kneighbors = 0
score_dtree = 0
score_randomfr = 0

# Split into training and test data, fold by fold
kf = model_selection.KFold(n_splits=splits).split(iris.data)
for train_idx, test_idx in kf:
    train_data = iris.data[train_idx]
    train_target = iris.target[train_idx]
    test_data = iris.data[test_idx]
    test_target = iris.target[test_idx]

    # Fit each model on this fold's training data
    clf1 = svm.LinearSVC(max_iter=10000)
    clf1.fit(train_data, train_target)
    clf2 = svm.SVC(kernel='poly', degree=3, gamma='scale')
    clf2.fit(train_data, train_target)
    clf3 = svm.SVC(kernel='rbf', gamma='scale')
    clf3.fit(train_data, train_target)
    clf4 = neighbors.KNeighborsClassifier(n_neighbors=6)
    clf4.fit(train_data, train_target)
###############################

# Blank lists to collect per-model cross-validation results.
print("\n*** Cross Validation Init ***")
xvModNames = []
xvAccuracy = []
xvSDScores = []
print("Done ...")

# Cross-validate each (name, estimator) pair in lModels.
from sklearn import model_selection
print("\n*** Cross Validation ***")
# iterate through the lModels
for vModelName, oModelObj in lModels:
    # 10-fold split, shuffled with a fixed seed for reproducibility
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=707)
    # actual cross validation
    cvAccuracy = cross_val_score(oModelObj, X, y, cv=kfold, scoring='accuracy')
    # prints result of cross val ... scores count = kfold splits
    print(vModelName, ":  ", cvAccuracy)
    # update lists for future use
    xvModNames.append(vModelName)
    xvAccuracy.append(cvAccuracy.mean())
    xvSDScores.append(cvAccuracy.std())

# cross val summary
print("\n*** Cross Validation Summary ***")
# header line of the summary table
msg = "%10s: %10s %8s" % ("Model   ", "xvAccuracy", "xvStdDev")
print(msg)
# for each model
                 l2_reg=3.535679697949907e-05,
                 learning_rate=0.0008170485394812195,
                 representation_name='acsf',
                 representation_params=acsf_params,
                 tensorboard=True,
                 store_frequency=25,
                 hidden_layer_sizes=(15, 88))

estimator.set_properties(ene_isopent)
estimator.generate_representation(xyz_isopent, zs_isopent, method="fortran")

# Training the model on 3 folds of n samples
for n in n_samples:

    # The first n training indices form this run's cross-validation pool.
    cv_idx = idx_train[:n]
    splitter = modsel.KFold(n_splits=3, random_state=42, shuffle=True)
    # split() yields positional (train, test) index pairs into cv_idx.
    indices = splitter.split(cv_idx)

    scores_per_fold = []
    traj_scores_per_fold = []

    for item in indices:
        # Map positional fold indices back to the actual sample indices.
        idx_train_fold = cv_idx[item[0]]
        idx_test_fold = cv_idx[item[1]]

        # NOTE(review): fit/score are called with index arrays only --
        # presumably the estimator holds the data internally (set via
        # set_properties/generate_representation above); confirm against
        # the estimator's API documentation.
        estimator.fit(idx_train_fold)

        # Scoring the model
        score = estimator.score(idx_test_fold)
        # Score on the fixed hold-out set as well.
        traj_score = estimator.score(idx_test)
        scores_per_fold.append(score)
Пример #23
0
attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()]
classNames = [name[0] for name in mat_data['classNames'].squeeze()]
N, M = X.shape
C = len(classNames)

# Parameters for neural network classifier
n_hidden_units = 4  # number of hidden units
n_train = 2  # number of networks trained in each k-fold

# These parameters are usually adjusted to: (1) data specifics, (2) computational constraints
learning_goal = 2.0  # stop criterion 1 (train mse to be reached)
max_epochs = 200  # stop criterion 2 (max epochs in training)

# K-fold CrossValidation (4 folds here to speed up this example)
K = 4
CV = model_selection.KFold(K, shuffle=True)

# Variable for classification error
# (NaN-initialized so folds that never run stay visibly NaN, not 0)
errors = np.zeros(K) * np.nan
error_hist = np.zeros((max_epochs, K)) * np.nan
bestnet = list()  # best network found per fold
k = 0  # fold counter
for train_index, test_index in CV.split(X, y):
    print('\nCrossvalidation fold: {0}/{1}'.format(k + 1, K))

    # extract training and test set for current CV fold
    # NOTE: y is indexed with two axes below, so y must be 2-D
    # (one column per target) at this point.
    X_train = X[train_index, :]
    y_train = y[train_index, :]
    X_test = X[test_index, :]
    y_test = y[test_index, :]
Пример #24
0
# train = train[predict_col]
# train = scipy.sparse.hstack([scipy.sparse.csr_matrix(train), desc_train, title_train])
# Test design matrix: tabular columns stacked with the description/title
# text features, converted to CSR for fast row indexing.
test_x = test[predict_col]
test_x = scipy.sparse.hstack(
    [scipy.sparse.csr_matrix(test_x), desc_test, title_test])
test_x = test_x.tocsr()

# Training target and design matrix, built the same way.
train_y = train["deal_probability"]
train_x = train[predict_col]
train_x = scipy.sparse.hstack(
    [scipy.sparse.csr_matrix(train_x), desc_train, title_train])
train_x = train_x.tocsr()
timer.time("prepare train in ")

# 4-fold CV without shuffling, so folds follow the original row order.
split_num = 4
skf = model_selection.KFold(n_splits=split_num, shuffle=False)
lgb = pocket_lgb.GoldenLgb()
total_score = 0
models = []

# NOTE(review): folds are generated from ``train`` but the indices are
# applied to ``train_x``/``train_y`` -- assumes identical row count and
# order; verify upstream.
for train_index, test_index in skf.split(train):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]
    model = lgb.do_train_avito(X_train, X_test, y_train, y_test, lgb_col)
    # Best validation RMSE LightGBM reported for this fold.
    score = model.best_score["valid_0"]["rmse"]
    total_score += score

    models.append(model)

# Feature importances from the first fold's model; mean RMSE over folds.
lgb.show_feature_importance(models[0])
print("average score= ", total_score / split_num)
Пример #25
0
# Raw sequence features and LBP-extracted features for the same samples.
raw_X = combineSeqData(Data)
lbp_X = combineLBPSeqData(Data)
y = np.array(Data['Face'].values)

# Project the raw features onto 76 whitened principal components.
pca_X = PCA(n_components=76, svd_solver='auto',
            whiten=True).fit(raw_X).transform(raw_X)

# Two tree-based models to compare on each feature representation.
tree_clf = DecisionTreeClassifier(max_depth=5, min_samples_leaf=1)
forest_clf = RandomForestClassifier(max_depth=6)

num_folds = 10
scoring = 'accuracy'
# Cross-validate both models on each representation and report
# mean accuracy with its standard deviation.
for name, data in (["RAW", raw_X], ["PCA", pca_X], ["LBP", lbp_X]):
    kfold = model_selection.KFold(n_splits=num_folds)
    tree_scores = model_selection.cross_val_score(tree_clf,
                                                  data,
                                                  y,
                                                  cv=kfold,
                                                  scoring=scoring)
    forest_scores = model_selection.cross_val_score(forest_clf,
                                                    data,
                                                    y,
                                                    cv=kfold,
                                                    scoring=scoring)
    print("%s DT: %f (%f)" % (name, tree_scores.mean(), tree_scores.std()))
    print("%s RF: %f (%f)" % (name, forest_scores.mean(), forest_scores.std()))
Пример #26
0
    def mltest(self, freqs):
        """Cross-validate several classifiers on the stored tone data and
        predict the key for the three supplied frequencies.

        Parameters
        ----------
        freqs : sequence of three strings
            The three tone frequencies; an empty string in any slot
            aborts to the error page.

        Returns
        -------
        Rendered ``details.html`` with the cross-validation summary, or
        ``index_error.html`` when a frequency is missing.
        """
        # BUG FIX: the original guard used ``is not ""`` which tests object
        # identity, not string contents; ``!=`` is the correct comparison.
        if freqs[0] != "" and freqs[1] != "" and freqs[2] != "":
            print(freqs)
            q = 'select tone1, tone2, tone3, tone_key from tones'
            df = pd.read_sql(q, self.con)
            df.head()
            df_size = df.size
            df_shape = df.shape
            df_columns = list(df.columns)
            df_targetname = df[df.columns[-1]].name
            # Features are every column between the first and the last;
            # the last column holds the key labels.
            df_Xfeatures = df.iloc[:, 1:-1]
            df_Ylabels = df[df.columns[-1]]

            # Model Building
            X = df_Xfeatures
            Y = df_Ylabels
            seed = 528
            # prepare models
            models = [
                ('K-Neighbours', KNeighborsClassifier()),
                ('Classification and Regression Tree',
                 DecisionTreeClassifier()),
                ('Gaussian Naive Bayes', GaussianNB()),
                ('SVC', svm.SVC()),
            ]

            # Hoist the frequency ratios out of the model loop -- they do
            # not depend on the model being evaluated.
            base = float(freqs[0])
            ratio1 = float(freqs[1]) / base
            ratio2 = float(freqs[2]) / base
            ratios = (base / base, ratio1, ratio2)

            # evaluate each model in turn
            results = []
            names = []
            allmodels = []
            scoring = 'accuracy'
            for name, model in models:
                print(name + " Prediction: ")
                model.fit(X, Y)
                predictions = model.predict([[ratio1, ratio2]])
                predict_text = [i.strip('[]') for i in predictions]
                # BUG FIX: KFold with a random_state requires shuffle=True
                # (scikit-learn >= 0.24 raises a ValueError otherwise).
                kfold = model_selection.KFold(n_splits=10,
                                              shuffle=True,
                                              random_state=seed)
                cv_results = model_selection.cross_val_score(model,
                                                             X,
                                                             Y,
                                                             cv=kfold,
                                                             scoring=scoring)

                results.append(cv_results)
                names.append(name)
                msg = "%s: %f (%f) %s %s" % (name, cv_results.mean(),
                                             cv_results.std(), predict_text,
                                             ratios)
                allmodels.append(msg)
                print("Name: ")
                print(name)
                print("Message: ")
                print(msg)
                print("Results: ")
                print(cv_results)

            print("Results: ")
            print(*results, sep="\n")
            print("Models: ")
            print(*allmodels, sep="\n")

            print(df_targetname)

            return render_template('details.html',
                                   df_size=df_size,
                                   df_shape=df_shape,
                                   df_columns=df_columns,
                                   df_targetname=df_targetname,
                                   model_results=allmodels,
                                   model_names=names,
                                   dfplot=df)

        else:
            return render_template('index_error.html')
Пример #27
0
        "num_leaves" : 30,
        "learning_rate" : 0.01,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        "bagging_frequency" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=200, evals_result=evals_result)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

# Shuffled, seeded 5-fold CV: train one model per fold, average the test
# predictions, then apply expm1 (inverse of log1p) to the averaged target.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
fold_predictions = []
for fit_idx, holdout_idx in kf.split(X_train):
    fit_X, holdout_X = X_train.loc[fit_idx, :], X_train.loc[holdout_idx, :]
    fit_y, holdout_y = y_train[fit_idx], y_train[holdout_idx]
    fold_pred, model, evals_result = run_lgb(fit_X, fit_y, holdout_X,
                                             holdout_y, X_test)
    fold_predictions.append(fold_pred)
pred_test_full = np.expm1(sum(fold_predictions) / 5.)

# Write the submission file.
sub_df = pd.DataFrame({"ID": test_df["ID"].values})
sub_df["target"] = pred_test_full
sub_df.to_csv("baseline_lgb_pca.csv", index=False)
Пример #28
0
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":
    # Shuffle the rows once, then tag every row with the index of the
    # 5-fold validation split it falls into.
    df = pd.read_csv("../input/train.csv")
    df["kfold"] = -1
    df = df.sample(frac=1).reset_index(drop=True)

    splitter = model_selection.KFold(n_splits=5)
    for fold_number, (train_idx, val_idx) in enumerate(splitter.split(X=df)):
        print(len(train_idx), len(val_idx))
        df.loc[val_idx, "kfold"] = fold_number

    # Persist the fold assignment for downstream training scripts.
    df.to_csv("../input/train_folds.csv", index=False)
            min_weight_fraction_leaf=0.0,
            #max_features=None, # number of features to consider when looking for the best split; None: max_features=n_features
            max_features="sqrt",
            max_leaf_nodes=None,  # None: unlimited number of leaf nodes
            bootstrap=True,
            oob_score=True,  # estimate Out-of-Bag Cross Entropy
            n_jobs=multiprocessing.cpu_count() - 4,  # paralellize over all CPU cores minus 4
            #class_weight=None,  # our classes are skewed, but but too skewed
            #class_weight={0:0.2,1:0.8},
            #class_weight={0:0.4,1:0.6},
            class_weight={0:0.6,1:0.4},
            random_state=RANDOM_SEED,
            verbose=0,
            warm_start=False)

    kfold = model_selection.KFold(n_splits=5, random_state=RANDOM_SEED)
    eval_standard = ['accuracy', 'recall_macro', 'precision_macro', 'f1_macro']
    results = []
    for scoring in eval_standard:
        cv_results = model_selection.cross_val_score(model,
                                                     X_train,
                                                     y_train,
                                                     scoring=scoring,
                                                     cv=kfold)
        results.append(cv_results)
        msg = "%s: %f (%f)" % (scoring, cv_results.mean(), cv_results.std())
        print(msg)
    # Make predictions on validation dataset
    test_df = pandas.read_csv(test_fullpath,
                              sep=',',
                              na_values='NA',
Пример #30
0
import pandas as pd
import numpy as np
import sklearn.linear_model as lm
from sklearn import model_selection
from main import *

# Column indices of the features and the target inside the ``data``
# matrix (``data`` comes in through the star import from ``main``).
feature = [0, 2, 4, 6, 7, 8]
target = [12]
X = data[:, feature]
y = data[:, target]  # shape (N, 1); squeezed to (N,) below
N, M = X.shape
K = 10

# One selected lambda per CV fold.
result_lambda = np.empty(K)

# Reproducible, shuffled K-fold split.
CV = model_selection.KFold(K, shuffle=True, random_state=1)

f = 0  # fold counter
y = y.squeeze()  # flatten (N, 1) -> (N,)
fold_size = np.empty(K)
# Candidate regularization strengths, log-spaced from 1e-20 to 1e10.
lambda_interval = np.logspace(-20, 10, 100)
train_error_rate = np.zeros(len(lambda_interval))
test_error_rate = np.zeros(len(lambda_interval))
# Per-lambda lists of errors, accumulated across folds.
genErrors = dict()
trainErrors = dict()

for i in range(0, len(lambda_interval)):
    genErrors[i] = []
    trainErrors[i] = []

for train_index, val_index in CV.split(X, y):