import argparse
import json

import numpy as np

# split_data and ttest are project helpers defined elsewhere in this repository.

parser = argparse.ArgumentParser(
    description='T-test to see if TAN and naive are different')
parser.add_argument('-data', type=str, help='Data set path')
args = parser.parse_args()

data = args.data
data = data.replace('\r', '')  # Remove the carriage return (Windows line endings)

# Load the data
with open(data, "r") as read_file:
    data = json.load(read_file)
metadata = np.array(data['metadata']['features'])

# Build a list of lists containing accuracy for naive Bayes and TAN
accuracy = []
for i in range(0, 10):
    split = split_data(data, i, 10)
    train = split[0]
    test = split[1]
    accuracy.append(ttest(train, test, metadata))

accuracy = np.array(accuracy)
differences = accuracy.T[1] - accuracy.T[0]
print("Accuracy (NB, TAN): \n", accuracy)
print("Differences (TAN - NB): \n", differences)

# Calculate average accuracy difference
diff_avg = np.mean(differences)
# Calculate standard deviation of differences
diff_sd = np.std(differences)
# Calculate standard error from SD
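# Illustrative continuation (not part of the original excerpt): one way the
# standard-error comment above is typically completed for a paired t-test over
# the 10 folds. Note that np.std above uses ddof=0; the sample SD (ddof=1) is
# the usual choice for a t-test. The variable names below are assumptions.
from scipy import stats

n_folds = len(differences)
diff_se = diff_sd / np.sqrt(n_folds)                       # standard error from the SD
t_stat = diff_avg / diff_se                                # paired t-statistic
p_value = 2 * stats.t.sf(np.abs(t_stat), df=n_folds - 1)   # two-sided p-value
print("t = %.4f, p = %.4f" % (t_stat, p_value))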
method = ['DBSCAN']
# driver_main, impostor, n_clusters, selected_features, window_size, method = fc.general_parameters()
n_estimators = fc.if_parameters()
kernel, nu = fc.ocsvm_parameters()

for driver in driver_main:
    for ws in window_size:
        print("Building DF for Driver", driver, "with Window_Size", ws)
        data_final = fc.build_df_final(data_normalized, driver, ws, selected_features)

        print('Building DF with all impostors')
        data_impostor, impostores = fc.build_impostors_df(data_normalized, impostor, ws,
                                                          selected_features, driver)

        print('Doing data split')
        x_train, x_val = fc.split_data(data_final)

        for c in n_clusters:
            print('Create clusters')
            labels_train, centroid_train, x_train_class = fc.clusters_of_maneuvers(x_train, c)

            for m in method:
                if m == 'DBSCAN':
                    print('Training DBSCAN')
                    dbscan_list, eps_list = fc.train_model_dbscan(labels_train, centroid_train,
                                                                  x_train_class, x_val)

                    print('Doing predictions DBSCAN')
                    result = fc.test_model_dbscan(dbscan_list, data_final, data_impostor,
                                                  centroid_train, x_train_class)

                    print('Evaluating the results')
                    acc, min_man, media_man, max_man, deviation = fc.evaluating_result(result, ws)
import functions
from keras.layers import Dropout
from keras.models import model_from_json
from keras.models import load_model
from nltk.tokenize import RegexpTokenizer

path = '/home/mark/Research'
data_dir = path + '/data'
train = True
load_all = True

weight_matrix, word_index = functions.load_embeddings(data_dir + '/glove.6B.100d.txt')
data = functions.read_data(data_dir)
train, test, val = functions.split_data(data, .8, data_dir)
train = train.reset_index()
test = test.reset_index()
val = val.reset_index()
# max_length, avg_words, seq_length = functions.maxLen(data)

train_x = functions.pipeline(train, word_index, weight_matrix)
test_x = functions.pipeline(test, word_index, weight_matrix)
val_x = functions.pipeline(val, word_index, weight_matrix)

train_y = functions.labels(train)
test_y = functions.labels(test)
val_y = functions.labels(val)

print('Training data: ')
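# Illustrative sketch (not in this excerpt): a typical Keras model that the
# pre-trained GloVe weight_matrix loaded above could feed into. seq_length, the
# layer sizes, and the binary output layer are assumptions; the original model
# is not shown here.
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

seq_length = 100  # assumed padded sequence length of train_x / val_x

model = Sequential()
model.add(Embedding(input_dim=weight_matrix.shape[0], output_dim=weight_matrix.shape[1],
                    weights=[weight_matrix], input_length=seq_length, trainable=False))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=5, batch_size=64)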
# get data
import functions as f
# import cv2
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn import datasets, svm, metrics

data = f.get_array_from_images('images_no_copies')
dev, test = f.split_data(data, 0.2)
training_data, training_labels = f.reshape(dev)
test_data, test_labels = f.reshape(test)

classifier = svm.SVC(gamma=0.001)
classifier.fit(training_data, training_labels)
score = classifier.score(test_data, test_labels)
print(score)
"Вас вітає мережа магазинів WINTIME, якщо вы хочете дізнатися інформацію про магазин в вашому місті - введіть назву свого міста" ) print( "Команди: /all - подивитися доступні міста, /add - додати місто, /edit - редагувати місто, /delete - видалити місто" ) print("==" * 20) while True: user_input = input("Ввести дані ( для виходу - exit ): ") if user_input != "exit" and user_input not in options: get = wc.getInfoCities(user_input) elif user_input.strip() == "/add": next_input = input( "Ведіть назву міста, адресу, телефон та години роботи через кому (city,address,phone,time): " ) new_city = f.split_data(next_input) if len(new_city) < 4: print("Ви ввели недостатньо даних, спробуйте ще раз:(") else: add = wc.addCity(new_city) elif user_input.strip() == "/edit": # print("Введіть назву міста, яке ви хочете редагувати\nДоступні міста") # cities = wc.getCities() edit_city = input( "Введіть назву міста та НОВІ ДАНІ через кому (city, new address, new phone, new time):" ) upd_city = f.split_data(edit_city) print(upd_city) edit = wc.editCity(upd_city)
logging.config.dictConfig(config)
logger = logging.getLogger('base')

# 2
logger.info("setting metaparameters")
n_epochs = 10
learning_rate = 0.01
batch_size = 100
logger.info("n_epochs: {}, learning_rate: {}, batch_size: {}"
            .format(n_epochs, learning_rate, batch_size))

# 3
logger.info("data preparation")
X_np, y_np = functions.get_data()
m, n = X_np.shape
X_split, y_split, n_batches = functions.split_data(X_np, y_np, batch_size, m)

# 4
logger.info("starting construction phase")
X = tf.placeholder(tf.float32, shape=(None, n), name="X")
y = tf.placeholder(tf.float32, shape=(None, 1), name="y")
theta = tf.Variable(tf.random_uniform([n, 1], -1.0, 1.0), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")

# 5
with tf.name_scope("loss") as scope:
    error = y_pred - y
    mse = tf.reduce_mean(tf.square(error), name="mse")
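# Illustrative continuation (not part of the original excerpt): the gradient
# descent training op and execution phase that typically follow this TF1-style
# construction phase. Only names defined above are reused; the optimizer choice
# is an assumption.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)
init = tf.global_variables_initializer()

logger.info("starting execution phase")
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for batch in range(n_batches):
            sess.run(training_op,
                     feed_dict={X: X_split[batch], y: y_split[batch]})
    best_theta = theta.eval()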
def main():
    st.title("Machine Learning Binary Classification Web App")
    st.markdown("Classifying the mushrooms 🍄 to be Edible or Poisonous")
    st.markdown("(Configure classifier options on the left side menu)")
    st.sidebar.title("Machine Learning Binary Classification Web App")
    st.sidebar.markdown(
        "Classifying the mushrooms 🍄 to be Edible or Poisonous")

    df = load_data()

    if st.checkbox("Show raw data", True):
        st.subheader("Mushroom Data Set (Classification)")
        st.write(df)

    x_train, x_test, y_train, y_test = split_data(df)
    class_names = ['edible', 'poisonous']

    st.sidebar.subheader("Choose Classifier")
    classifier = st.sidebar.selectbox(
        "Classifier",
        ("Support Vector Machine (SVM)", "Logistic Regression", "Random Forest"))

    if classifier == "Support Vector Machine (SVM)":
        st.sidebar.subheader("Model Hyperparameters")
        C = st.sidebar.number_input(
            "C (Regularisation parameter)", 0.01, 10.0, step=0.01, key="C")
        kernel = st.sidebar.radio("Kernel", ("rbf", "linear"), key="kernel")
        gamma = st.sidebar.radio(
            "Gamma (Kernel Coefficient)", ("scale", "auto"), key="gamma")
        metrics = st.sidebar.multiselect("What metrics to plot?", (
            'Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'), key='metrics')

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Support Vector Machine (SVM) Results: ")
            model = SVC(C=C, kernel=kernel, gamma=gamma)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write("Precision: ", precision_score(
                y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ", recall_score(
                y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics, model, x_test, y_test, class_names)

    if classifier == "Logistic Regression":
        st.sidebar.subheader("Model Hyperparameters")
        C = st.sidebar.number_input(
            "C (Regularisation parameter)", 0.01, 10.0, step=0.01, key="C_LR")
        max_iter = st.sidebar.slider(
            "Maximum number of iterations", 100, 500, key='max_iter')
        metrics = st.sidebar.multiselect("What metrics to plot?", (
            'Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'), key='metrics')

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Logistic Regression Results: ")
            model = LogisticRegression(C=C, max_iter=max_iter)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write("Precision: ", precision_score(
                y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ", recall_score(
                y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics, model, x_test, y_test, class_names)

    if classifier == "Random Forest":
        st.sidebar.subheader("Model Hyperparameters")
        n_estimators = st.sidebar.number_input(
            "The number of trees in the forest", 100, 5000, step=10, key='n_estimators')
        max_depth = st.sidebar.number_input(
            "The maximum depth of the tree", 1, 20, step=1, key='max_depth')
        bootstrap = st.sidebar.radio(
            "Bootstrap samples when building trees", ('True', 'False'), key='bootstrap')
        metrics = st.sidebar.multiselect("What metrics to plot?", (
            'Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'), key='metrics')

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Random Forest Results: ")
            model = RandomForestClassifier(
                n_estimators=n_estimators, max_depth=max_depth,
                bootstrap=(bootstrap == 'True'),  # the radio returns a string; convert to bool
                n_jobs=-1)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write("Precision: ", precision_score(
                y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ", recall_score(
                y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics, model, x_test, y_test, class_names)
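# plot_metrics() is called above but not shown in this excerpt. A minimal sketch
# of what such a helper could look like, assuming scikit-learn's Display API
# (>= 1.0) and Streamlit's st.pyplot; this is illustrative, not the original helper.
import streamlit as st
from sklearn.metrics import (ConfusionMatrixDisplay, RocCurveDisplay,
                             PrecisionRecallDisplay)


def plot_metrics(metrics_list, model, x_test, y_test, class_names):
    # Each selected metric is drawn onto its own matplotlib figure and handed to Streamlit.
    if 'Confusion Matrix' in metrics_list:
        st.subheader("Confusion Matrix")
        disp = ConfusionMatrixDisplay.from_estimator(
            model, x_test, y_test, display_labels=class_names)
        st.pyplot(disp.figure_)
    if 'ROC Curve' in metrics_list:
        st.subheader("ROC Curve")
        disp = RocCurveDisplay.from_estimator(model, x_test, y_test)
        st.pyplot(disp.figure_)
    if 'Precision-Recall Curve' in metrics_list:
        st.subheader("Precision-Recall Curve")
        disp = PrecisionRecallDisplay.from_estimator(model, x_test, y_test)
        st.pyplot(disp.figure_)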
# get data
from functions import get_all_data, split_data, reshape
import cv2
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn import datasets, svm, metrics
import os
import pickle

data = get_all_data()
dev, test = split_data(data, 0.2)
training_data, training_labels = reshape(dev)
test_data, test_labels = reshape(test)

classifier = svm.SVC(gamma=0.001)
classifier.fit(training_data, training_labels)

# save the model to disk
filename = 'model.sav'
pickle.dump(classifier, open(filename, 'wb'))

# score = classifier.score(test_data, test_labels)
# print(score)
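# Usage note (illustrative, not part of the original excerpt): the pickled
# classifier saved above can later be reloaded and scored on the held-out split,
# mirroring the commented-out lines.
import pickle

with open('model.sav', 'rb') as fh:
    loaded_classifier = pickle.load(fh)
print(loaded_classifier.score(test_data, test_labels))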
def run_regression(X, z, reg_string, polydegree, lambdas, N_bs, K, test_size, scale,
                   max_iter=50000):
    """
    Runs the selected regression method for the input design matrix, the given p's and
    lambdas, using the specified resampling methods. While there may be several ways this
    could have been done more optimally, the function exists because of a rather late
    attempt at restructuring the code to reduce the amount of duplicated regression code,
    which had escalated out of control and made debugging extremely difficult.

    :param X: (N, p) array containing input design matrix
    :param z: (N, 1) array containing data points
    :param reg_string: string containing the name of the regression method to be used
    :param polydegree: list/range of the different p-values to be used
    :param lambdas: array of all the lambda values to be used
    :param N_bs: int, number of Bootstraps
    :param K: int, number of folds in the Cross-Validation
    :param test_size: float, size of the test partition [0.0, 1.0]
    :param scale: list determining if the scaling is only by the mean, the std or both
                  [bool(mean), bool(std)]
    :param max_iter: maximum number of iterations for Lasso
    :return: a lot of arrays with the various results and different ways of representing the data
    """
    nlambdas = len(lambdas)  # number of lambdas
    p = polydegree[-1]       # the maximum p-value
    method = 4               # OLS method

    # Splitting into train and test, scaling the data
    X_train, X_test, z_train, z_test = fun.split_data(X, z, test_size=test_size)
    X_train_scaled = fun.scale_X(X_train, scale)
    X_test_scaled = fun.scale_X(X_test, scale)
    X_scaled = fun.scale_X(X, scale)

    # Bootstrap arrays
    bs_error_train = np.zeros((p, nlambdas))
    bs_error_test = np.zeros((p, nlambdas))
    bs_bias = np.zeros((p, nlambdas))
    bs_var = np.zeros((p, nlambdas))
    bs_error_train_opt = np.zeros((p, 2))
    bs_error_test_opt = np.zeros((p, 2))
    # First index is min(MSE) lmb for each p, second at lmb that yields total lowest MSE
    bs_bias_opt = np.zeros((p, 2))
    bs_var_opt = np.zeros((p, 2))
    bs_lmb_opt = np.zeros(p)

    # Cross-validation arrays
    cv_error_train = np.zeros((p, nlambdas))
    cv_error_test = np.zeros((p, nlambdas))
    cv_error_train_opt = np.zeros((p, 2))
    cv_error_test_opt = np.zeros((p, 2))
    cv_lmb_opt = np.zeros(p)

    # Setting up regression object to be used for regression (Lasso is dealt with later)
    reg_obj = reg.OrdinaryLeastSquares(method)  # default
    if reg_string == 'SKL':
        reg_obj = skl.LinearRegression()  # Testing with scikit-learn OLS
    elif reg_string == 'Ridge':
        reg_obj = reg.RidgeRegression()

    # Looping over all polynomial degrees in the analysis
    for degree in polydegree:
        # number of terms in the design matrix for the given degree
        n_poly = fun.polynom_N_terms(degree)
        print('p = %2d, np = %3d' % (degree, n_poly))

        # Setting up correct design matrices for the current degree
        X_train_bs = np.zeros((len(X_train_scaled), n_poly))
        X_test_bs = np.zeros((len(X_test_scaled), n_poly))
        X_cv = np.zeros((len(X_scaled), n_poly))

        # Filling the elements up to term n_poly
        X_train_bs[:, :] = X_train_scaled[:, 0:n_poly]
        X_test_bs[:, :] = X_test_scaled[:, 0:n_poly]
        X_cv[:, :] = X_scaled[:, 0:n_poly]

        # Looping over all the lambda values
        for i in range(nlambdas):
            lmb = lambdas[i]  # current lambda value

            # Printing out in order to gauge where we are
            if i % 10 == 0:
                print('i = %d, lmb = %.3e' % (i, lmb))

            # Updating the current lambda value for Ridge and Lasso
            if reg_string == 'Ridge':
                reg_obj.set_lambda(lmb)
            elif reg_string == 'Lasso':
                reg_obj = skl.Lasso(alpha=lmb, max_iter=max_iter,
                                    precompute=True, warm_start=True)

            # Bootstrap
            BS = res.Bootstrap(X_train_bs, X_test_bs, z_train, z_test, reg_obj)
            error_, bias_, var_, trainE_ = BS.compute(N_bs)  # performing the Bootstrap
            bs_error_test[degree - 1, i] = error_
            bs_bias[degree - 1, i] = bias_
            bs_var[degree - 1, i] = var_
            bs_error_train[degree - 1, i] = trainE_

            # Cross-validation
            CV = res.CrossValidation(X_cv, z, reg_obj)
            trainE, testE = CV.compute(K)  # performing the Cross-Validation
            cv_error_train[degree - 1, i] = trainE
            cv_error_test[degree - 1, i] = testE

        # Locating minimum MSE for each polynomial degree
        # Bootstrap
        index_bs = np.argmin(bs_error_test[degree - 1, :])
        bs_lmb_opt[degree - 1] = lambdas[index_bs]
        bs_error_train_opt[:, 0] = bs_error_train[:, index_bs]
        bs_error_test_opt[:, 0] = bs_error_test[:, index_bs]
        bs_bias_opt[:, 0] = bs_bias[:, index_bs]
        bs_var_opt[:, 0] = bs_var[:, index_bs]

        # Cross-validation
        index_cv = np.argmin(cv_error_test[degree - 1, :])
        cv_lmb_opt[degree - 1] = lambdas[index_cv]
        cv_error_train_opt[:, 0] = cv_error_train[:, index_cv]
        cv_error_test_opt[:, 0] = cv_error_test[:, index_cv]

    # Locate minimum MSE to see how it depends on lambda
    bs_min = np.unravel_index(np.argmin(bs_error_test), bs_error_test.shape)
    cv_min = np.unravel_index(np.argmin(cv_error_test), cv_error_test.shape)
    bs_best = [polydegree[bs_min[0]], lambdas[bs_min[1]]]
    cv_best = [polydegree[cv_min[0]], lambdas[cv_min[1]]]

    # Bootstrap
    bs_error_train_opt[:, 1] = bs_error_train[:, bs_min[1]]
    bs_error_test_opt[:, 1] = bs_error_test[:, bs_min[1]]
    bs_bias_opt[:, 1] = bs_bias[:, bs_min[1]]
    bs_var_opt[:, 1] = bs_var[:, bs_min[1]]

    # Cross-validation
    cv_error_train_opt[:, 1] = cv_error_train[:, cv_min[1]]
    cv_error_test_opt[:, 1] = cv_error_test[:, cv_min[1]]

    # This return is extremely large, sadly, and should have been improved upon;
    # it was just the fastest way of doing it when the code had to be restructured,
    # so better planning would be the better solution in the future.
    return (bs_error_train, bs_error_test, bs_bias, bs_var, bs_error_train_opt,
            bs_error_test_opt, bs_bias_opt, bs_var_opt, bs_lmb_opt,
            cv_error_train, cv_error_test, cv_error_train_opt, cv_error_test_opt,
            cv_lmb_opt, bs_min, bs_best, cv_min, cv_best)
########################################################################################################################

if run_mode == 'a':
    save = 'N%d_nf%d_p%d_noise%.2f_seed%d' % (N, n_franke, p, noise, seed)

    # Splitting into train and test data
    X_train, X_test, z_train, z_test = fun.split_data(X, z_ravel, test_size=test_size)
    # X_train, X_test, z_train, z_test = train_test_split(X, z_ravel, test_size=test_size)

    # Scaling the data
    X_train_scaled = fun.scale_X(X_train, scale)
    X_test_scaled = fun.scale_X(X_test, scale)

    # Plotting the Franke function
    fun.plot_surf(x_mesh, y_mesh, z_mesh, 'x', 'y', 'z',
                  'Franke function, $N$=%d, noise=%.2f' % (N, noise),