Example #1
def fit(j, testing=None, coef=False):
    # Set training data and target data
    X = np.array(data.loc[1:, np.delete(data.columns.values, 0)])
    Y = np.array(data.loc[1:, ['five_star']]).ravel()

    # Assign the training/testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1 - j)

    # Assign the testing set to the new vars if necessary
    if testing is None:
        testing = [X_test, Y_test]

    # Instantiate binary classification logistic regression model
    logreg = linear_model.LogisticRegression(C=1e5)

    # Fit model
    logreg.fit(X_train, Y_train)

    # Set hypothesis and true target data
    h = logreg.predict(testing[0])
    y = testing[1]

    # Return the coefficient matrix if necessary
    if coef:
        return (h, y, error_rate(h, y), logreg.coef_)

    return (h, y, error_rate(h, y))
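
A minimal usage sketch, assuming a module-level data DataFrame with a five_star column and an error_rate(h, y) helper as used above (the 0.8 split fraction is illustrative):

# Train on 80% of the rows, evaluate on the held-out 20%
h, y, err = fit(0.8)
print("error rate:", err)

# Also retrieve the fitted coefficient matrix
h, y, err, coefs = fit(0.8, coef=True)
print("coefficient matrix shape:", coefs.shape)
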
def runKNN():
    docs_train, docs_test, y_train, y_test = train_test_split(
        data['tweet'], data['mvmt'], test_size=0.25, random_state=None)

    pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
        ('clf', KNeighborsClassifier(n_neighbors=3)),
    ])

    # Fit the pipeline on the training set using grid search for the parameters
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
    }
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
    grid_search.fit(docs_train, y_train)

    n_candidates = len(grid_search.cv_results_['params'])
    for i in range(n_candidates):
        print(
            i, 'params - %s; mean - %0.2f; std - %0.2f' %
            (grid_search.cv_results_['params'][i],
             grid_search.cv_results_['mean_test_score'][i],
             grid_search.cv_results_['std_test_score'][i]))
    # TASK: Predict the outcome on the testing set and store it in a variable
    # named y_predicted
    y_predicted = grid_search.predict(docs_test)
    # Print the classification report
    print(metrics.classification_report(y_test, y_predicted))
    # Print and plot the confusion matrix
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)
    plt.matshow(cm)
    plt.show()
Example #3
def run_main():

    file_df = pd.read_csv('../dataset/voice.csv')
    # print(file_df)
    insect_dataset(file_df)
    # Handle missing values
    drop_na(file_df)
    # Show the count of each label, grouped
    # print(file_df['label'].value_counts())
    # Visualize feature distributions
    fea_name1 = 'meanfun'
    fea_name2 = 'centroid'

    # Feature plot for two attributes
    # visaulize_two_feature(file_df,fea_name1,fea_name2)

    # Feature plot for a single attribute
    # visaulize_single_feature(file_df,fea_name1)

    # Multiple features
    fea_name = ['meanfreq', 'Q25', 'Q75', 'skew', 'centroid', 'label']
    # visaulize_muilt_feature(file_df,fea_name)

    X = file_df.iloc[:, :-1].values
    file_df['label'].replace('male', 0, inplace=True)
    file_df['label'].replace('female', 1, inplace=True)
    y = file_df['label'].values

    # Feature standardization
    X = preprocessing.scale(X)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=1 / 3.,
                                                        random_state=5)

    # Model selection via cross-validation
    cv_scores = []
    k_range = range(1, 31)
    for k in k_range:
        knn = KNeighborsClassifier(k)
        # print('knn:', knn)
        scores = cross_val_score(knn,
                                 X_train,
                                 y_train,
                                 cv=10,
                                 scoring='accuracy')
        score_mean = scores.mean()
        cv_scores.append(score_mean)
        print('%i:%.4f' % (k, score_mean))

    best_k = np.argmax(cv_scores) + 1

    # Train the final model with the best k
    knn_model = KNeighborsClassifier(best_k)
    knn_model.fit(X_train, y_train)
    print('Test accuracy:', knn_model.score(X_test, y_test))

    return ''
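
The manual cross-validation loop above can also be written with GridSearchCV; a minimal equivalent sketch, assuming the same X_train, y_train, X_test, y_test as in run_main:

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Search the same k range with 10-fold CV and accuracy scoring
param_grid = {'n_neighbors': list(range(1, 31))}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring='accuracy')
grid.fit(X_train, y_train)
print('best k:', grid.best_params_['n_neighbors'])
print('test accuracy:', grid.score(X_test, y_test))
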
Example #4
 def _train(self) -> Model:
     data, label = load_data0_cycle()
     train_data, test_data, train_label, test_label = train_test_split(
         data, label, test_size=0.2)
     train_data = np.reshape(train_data, train_data.shape + (1, ))
     train_label = to_categorical(train_label)
     test_data = np.reshape(test_data, test_data.shape + (1, ))
     test_label = to_categorical(test_label)
     network_input = Input(shape=(8, 200, 1))
     # If you change the network structure here, remember to update the parameters of the visualization function below as well
     network = Conv2D(filters=20, kernel_size=(1, 10))(network_input)
     network = Conv2D(filters=40, kernel_size=(4, 10),
                      activation=tanh)(network)
     network = MaxPool2D((2, 2))(network)
     network = Flatten()(network)
     network = Dense(units=40, activation=tanh)(network)
     network = Dense(units=10, activation=softmax)(network)
     network = Model(inputs=[network_input], outputs=[network])
     network.compile(optimizer=RMSprop(),
                     loss=categorical_crossentropy,
                     metrics=[categorical_accuracy])
     network.summary()
     self.train_history = network.fit(train_data,
                                      train_label,
                                      batch_size=32,
                                      epochs=16)
     self.evaluate_history = network.evaluate(test_data, test_label)
     return network
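
A follow-up sketch, assuming _train belongs to some trainer class (trainer below is a hypothetical instance) and you want to inspect the stored training history:

import matplotlib.pyplot as plt

model = trainer._train()  # hypothetical instance of the class defining _train()
plt.plot(trainer.train_history.history['loss'], label='train loss')
plt.plot(trainer.train_history.history['categorical_accuracy'], label='train accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()
print('test loss / accuracy:', trainer.evaluate_history)
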
Example #5
def get_train():
    corpus = get_vec()
    labels = get_labels()

    X_train,X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.2)

    return X_train, y_train, X_test, y_test
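
Note that the return order is (X_train, y_train, X_test, y_test), not the (X_train, X_test, y_train, y_test) order produced by train_test_split itself. A minimal usage sketch, assuming get_vec() and get_labels() come from the same module:

X_train, y_train, X_test, y_test = get_train()
print(len(X_train), 'training samples,', len(X_test), 'test samples')
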
def base_linear_model(dataframe, device_parameter):

    X_train, X_validation, y_train, y_validation = train_test_split(
        dataframe,
        device_parameter,
        test_size=0.25,
        random_state=42,
        shuffle=True)

    # Fitting the data into linear regression model
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)

    # Evaluation of base linear regression model
    t_pred = lin_reg.predict(X_train)
    y_pred = lin_reg.predict(X_validation)

    # Calculating mean square error on training and testing
    train_mse = mean_squared_error(y_train, t_pred)
    test_mse = mean_squared_error(y_validation, y_pred)

    print("Training mean squared error: ", train_mse)
    print("Testing mean squared error: ", test_mse)

    # Plotting results of linear regression base model
    fig, ax = plt.subplots()
    ax.scatter(y_pred, y_validation, edgecolors=(0, 0, 1))
    ax.plot([y_validation.min(), y_validation.max()],
            [y_validation.min(), y_validation.max()],
            'r--',
            lw=3)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()
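
A minimal usage sketch with synthetic data standing in for the real device measurements (make_regression is only a placeholder for whatever dataframe and device_parameter normally hold):

import pandas as pd
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=500, n_features=5, noise=10.0, random_state=42)
base_linear_model(pd.DataFrame(X), pd.Series(y))
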
Example #7
def process_gsc_data(value):
    if (value == 0):
        fetched_data = extract_human_data_sub()
    else:
        fetched_data = extract_human_data_con()
    fetched_data = pd.DataFrame(fetched_data)
    BigSigma = fetched_data.cov()
    fetched_data = fetched_data.loc[:, (BigSigma != 0).any(axis=0)]
    BigSigma = pd.DataFrame(fetched_data).cov()
    BigSigma = np.diag(np.diag(BigSigma))
    BigSigma_inv = np.linalg.inv(BigSigma)
    fetched_data = fetched_data.values
    print(fetched_data.shape, BigSigma_inv.shape)
    train, test_and_val, train_out, test_and_val_out = train_test_split(
        fetched_data, target, test_size=0.3, shuffle=True)
    train = np.array(train)
    pivot = int(len(test_and_val) / 2)
    test = test_and_val[:pivot]
    val = test_and_val[pivot:]

    test_out = test_and_val_out[:pivot]
    val_out = test_and_val_out[pivot:]
    # print(len(fetched_data))

    return train, test, val, train_out, test_out, val_out, BigSigma_inv
Example #8
 def __init__(self, **kwargs):
     super(BoschChallenge, self).__init__(path=None)
     stream = open("../data/train.yaml", "r")
     files = yaml.safe_load(stream)  # yaml.load now requires an explicit Loader; safe_load is the safe default
     df = pd.DataFrame(files)
     df['path'] = df['path'].apply(lambda x: '../data/' + x[x.find('/'):])
     self.trainData, self.valData = train_test_split(df, test_size=.2)
Example #9
    def _iter_test_indices(self, X, y=None, groups=None):
        n = _num_samples(X)
        index = np.arange(n)

        train_index, test_index = train_test_split(
            index, test_size=self.test_size, random_state=self.random_state)
        yield test_index
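
A minimal sketch of wrapping this generator into a complete splitter, assuming the intent is a single random train/test split usable wherever scikit-learn accepts a cv object (the class name SingleShuffleSplit is illustrative):

import numpy as np
from sklearn.model_selection import BaseCrossValidator, cross_val_score, train_test_split
from sklearn.utils.validation import _num_samples
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification

class SingleShuffleSplit(BaseCrossValidator):
    def __init__(self, test_size=0.25, random_state=None):
        self.test_size = test_size
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, groups=None):
        return 1

    def _iter_test_indices(self, X, y=None, groups=None):
        n = _num_samples(X)
        index = np.arange(n)
        _, test_index = train_test_split(
            index, test_size=self.test_size, random_state=self.random_state)
        yield test_index

X, y = make_classification(n_samples=200, random_state=0)
print(cross_val_score(LogisticRegression(max_iter=1000), X, y,
                      cv=SingleShuffleSplit(test_size=0.25, random_state=0)))
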
def test_ml_pipeline():
    'load a test data set, run SVM on it, and plot the predictions vs the actual values'
    data, targets = ReactivityDataLoader().load_mopac_learning()
    regressor = SVR(C=1000)
    trainData, testData, trainTargets, testTargets = train_test_split(data, targets)
    regressor.fit(trainData, trainTargets)
    os.chdir(str(Path.home() / 'Desktop'))
    main.plotScatterPlot(testTargets, regressor.predict(testData), 'predictedVsActual')
Example #11
 def fit(self, X, y):
     trees = []
     for index in range(self.n_estimators):
         tree = DecisionTreeClassifier()
         trees.append(tree)
         # train_size is an absolute sample count here: self.max_features samples per tree
         X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=int(self.max_features))
         trees[index].fit(X_train, y_train)
     # Return only after every estimator has been trained, not inside the loop
     return trees, y_test, X_test
Example #12
def train_boost(booster,
                seed,
                oversampling=-1.0,
                use_tfidf=False,
                enable_cv=False,
                use_alldata=False,
                num_trees=-1):
    train, y, features = prepare_train()
    if use_tfidf:
        print('Using raw tf-idf sparse matrix ... ')
        features = 'auto'
        train_sparse = sparse.csr_matrix(train.values)
        # tfidf_sparse = load_sparse_csr('tfidf_stem_train.npz')
        bm25_sparse = load_sparse_csr('bm25_train.npz')
        # bm25_sparse = bm25_sparse[404290 - 50000:, :]
        # train = sparse.hstack([train_sparse, tfidf_sparse])
        # common_words = load_sparse_csr('train_tfidf_commonwords.npz')
        # symmdif = load_sparse_csr('train_tfidf_symmdiff.npz')
        train = sparse.hstack([train_sparse, bm25_sparse])
        del train_sparse, bm25_sparse
        print('Train shape: ', train.shape)

    if enable_cv:
        train, y = shuffle(train, y)
        booster.cv(train, y)
        exit()

    if use_alldata:
        print('Using all data to fit classifier ... ')
        assert num_trees > 0
        results = booster.fit_all(train, y, num_trees, features)
    else:
        print('Using train/dev split to fit classifier ... ')
        X_train, X_eval, y_train, y_eval = train_test_split(train,
                                                            y,
                                                            stratify=y,
                                                            test_size=0.20,
                                                            random_state=seed)

        if oversampling > 0:
            print('Oversampling X_train, X_eval datasets ... ')
            X_train, y_train = oversample_sparse(X_train,
                                                 y_train,
                                                 p=oversampling)
            X_eval, y_eval = oversample_sparse(X_eval, y_eval, p=oversampling)

        results = booster.fit(X_train, X_eval, y_train, y_eval, features)
        y_pred = booster.predict(X_eval)
        print(log_loss(y_eval, y_pred))
        print(y_pred)

    train = None
    y = None
    del train
    del y

    return results
Example #13
def main():
    samples = load_files("data")

    sequence_dim = 20
    sequence_lag = 1

    samples, labels = make_sequences(samples, sequence_dim, sequence_lag)

    model = Sequential()
    model.add(LSTM(128, input_shape=(sequence_dim, 2), return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(64))
    model.add(Dense(2))

    model.summary()  # summary() already prints the table; wrapping it in print() adds a stray "None"

    (trainSamples, testSamples, trainLabels,
     testLabels) = train_test_split(samples,
                                    labels,
                                    test_size=0.15,
                                    random_state=42)

    imname = "animal-11"
    image = cv2.imread("img/{}.jpg".format(imname))
    # create ground truth image with all train gazes
    for j in range(len(trainLabels)):
        s = trainLabels[j]
        cv2.circle(image, (int(s[0]), int(s[1])), 10, (255, 0, 0), 3)
    cv2.imwrite("img/{}_truth.jpg".format(imname), image)

    model.compile(loss="mean_absolute_error",
                  optimizer="adam",
                  metrics=["mae"])

    EPOCHS = 30
    for e in range(EPOCHS):
        print("=" * 50)
        print("Iteration: {}".format(e))
        model.fit(trainSamples,
                  trainLabels,
                  validation_data=(testSamples, testLabels),
                  epochs=1,
                  batch_size=128,
                  verbose=1)

        predictions = model.predict(testSamples)

        # create and save image with all current predictions
        image = cv2.imread("img/{}.jpg".format(imname))
        cv2.line(image, (0, 0), (200, 200), (255, 255, 255), 2)
        for p in predictions:
            cv2.circle(image, (int(p[0]), int(p[1])), 10, (0, 255, 0), 3)
        cv2.imwrite("img/{}_e{:02d}.jpg".format(imname, e), image)

    model.save("model_rnn.h5")
Example #14
File: CarDetect.py Project: Z-Drop/CV
def get_SVM_classifier(datas, labels, split_size):
    # Split into training and test data
    x_train, x_test, y_train, y_test = train_test_split(datas,
                                                        labels,
                                                        test_size=split_size)
    # Build a linear SVM classifier and train it on the training split only
    clf = LinearSVC(C=1, loss="hinge").fit(x_train, y_train)
    print("Score on train data:{0:.2%}".format(clf.score(x_train, y_train)))
    print("Score on test data(split_size:{0}):{1:.2%}".format(
        split_size, clf.score(x_test, y_test)))
    return clf
Example #15
def train(args, **kwargs):
    n_gaus_comp = args.ncomp
    kmeans_mu = kwargs.get('kmeans', False)
    X_train = toy_data(n_samples=10000)

    X_train, X_test, Y_train, Y_test = train_test_split(X_train,
                                                        X_train,
                                                        test_size=0.2,
                                                        random_state=1)
    X_train, X_dev, Y_train, Y_dev = train_test_split(X_train,
                                                      Y_train,
                                                      test_size=0.2,
                                                      random_state=1)

    input_size = X_train.shape[1]
    output_size = X_train.shape[1]
    batch_size = 1000

    mus = np.random.randn(n_gaus_comp, 2).astype('float32')
    #mus = X_train[0:n_gaus_comp]
    raw_stds = None
    raw_cors = None

    model = NNModel(n_epochs=100000,
                    batch_size=batch_size,
                    input_size=input_size,
                    output_size=output_size,
                    early_stopping_max_down=10,
                    n_gaus_comp=n_gaus_comp,
                    mus=mus,
                    sigmas=raw_stds,
                    corxy=raw_cors)
    model.build()
    model.fit(X_train, Y_train, X_dev, Y_dev, X_test, Y_test)
    mus_eval, sigmas_eval, corxy_eval, pis_eval = model.f_predict(X_dev)
    mus_eval, sigmas_eval, corxy_eval, pis_eval = np.asarray(
        mus_eval), np.asarray(sigmas_eval), np.asarray(corxy_eval), np.asarray(
            pis_eval)
    logging.info(mus_eval)
    logging.info(sigmas_eval)
    pdb.set_trace()
Example #16
def main():
    df = pd.read_csv("data/titanic.csv")
    print(df)
    df.drop(['Name'], axis=1, inplace=True)
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    print(df)

    # print("probabilities of survival:")
    # for col in df.columns[1:]:
    #     df2 = pd.crosstab(values=df.index, index=df['Survived'], columns=df[col], aggfunc='count')
    #     print(df2)

    features = list(df.columns[1:])
    labels = ['Survived', 'Not Survived']
    data = df[df.columns[1:]].values.tolist()
    target = list(df['Survived'].map({True: 1, False: 0}))
    print(len(features), "Features: ", features)
    print(len(data), 'Data: ', data)
    print(len(target), 'Target: ', target)

    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        test_size=0.2)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Accuracy of random forest model: ",
          metrics.accuracy_score(y_test, y_pred))

    feature_imp = pd.Series(clf.feature_importances_,
                            index=features).sort_values(ascending=True)
    #print(feature_imp)

    # Creating a bar plot
    sns.barplot(x=feature_imp, y=feature_imp.index)

    # Add labels to graph
    plt.xlabel('Feature Importance Score by Random Forest')
    plt.ylabel('Features')
    plt.legend()
    plt.show()

    # Predict probabilities for the held-out test data.
    probs = clf.predict_proba(X_test)
    # Keep probabilities of the positive class only.
    probs = probs[:, 1]
    # Compute the AUC score on the test set.
    auc = roc_auc_score(y_test, probs)
    print('AUC: %.2f' % auc)

    # fpr, tpr, thresholds = roc_curve(y_test, probs)
    plot_roc_curve(clf, X_test, y_test)
    plt.show()
Example #17
def check_generalization(pipe,
                         metric,
                         X,
                         y,
                         test_size=0.2,
                         dishonnest_validation_mlp=False):
    '''
    Check for poor generalization (a large train/test gap) of a pipeline
    :param pipe: sklearn Pipeline whose last step ('clf') is the estimator
    :param metric: scoring function called as metric(y_pred, y_true)
    :param X: features (train and test)
    :param y: targets (train and test)
    :param dishonnest_validation_mlp: if True, pass the (preprocessed) test set as
        validation data to the final estimator (a small, deliberate leakage)
    '''
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size)
    #     print([i[0] for i in pipe.steps])
    #     print([i[0] for i in pipe.named_steps.u_prep.transformer_list])

    if dishonnest_validation_mlp:
        assert hasattr(
            pipe.named_steps.clf, "validation"
        ), "no attribute validation, pipe's clf is no instance of CustomNNCategorical"
        prep_val_pipe = deepcopy(pipe)
        prep_val_pipe.steps = prep_val_pipe.steps[:
                                                  -1]  # only keep the preprocessing
        prep_val_pipe = prep_val_pipe.fit(x_train)  #, y_train
        x_val = prep_val_pipe.transform(x_test)
        pipe.named_steps.clf.validation = (x_val, y_test)

    pipe.steps = pipe.steps[:-1] + [
        ("dim_print", DimPrinter())
    ] + pipe.steps[-1:]  # add a print of X dimension

    pipe.fit(x_train, y_train)
    pred_train = pipe.predict(x_train)
    pred_test = pipe.predict(x_test)

    score_train = metric(pred_train, y_train)
    score_test = metric(pred_test, y_test)

    gen = {
        "score_train": score_train,
        "score_test": score_test,
        "fitted_pipe": pipe
    }
    return gen
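
A minimal usage sketch, assuming DimPrinter and this function's other module-level dependencies are importable, and using a plain scikit-learn pipeline as a stand-in for the real one:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

X, y = make_classification(n_samples=500, random_state=0)
pipe = Pipeline([('scale', StandardScaler()),
                 ('clf', LogisticRegression(max_iter=1000))])

# metric is called as metric(pred, y_true) above, so accuracy_score works
gen = check_generalization(pipe, accuracy_score, X, y, test_size=0.2)
print(gen['score_train'], gen['score_test'])
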
 def preprocessing(self, file_name, column_num, ratio, random_state):
     '''
     Standardize the split samples.
     Split the samples with the given ratio and random state.
     :param file_name: the raw sklearn sample file to process
     :param column_num: the column index at which to split
     :param ratio: ratio of training set to test set
     :param random_state: random state for the train/test split
     :return:
     '''
     data = np.loadtxt(file_name, dtype=str, delimiter=' ')
     # Split the txt file column-wise at column_num
     y, x = np.split(data, [column_num], axis=1)
     x_train, x_test, y_train, y_test = train_test_split(
         x, y, random_state=random_state, train_size=ratio)
     return x, x_train, x_test, y, y_train, y_test
Example #19
def train():
    X = []
    Y = []
    X1, Y1 = loadSample("ss.txt", 0)
    X2, Y2 = loadSample("good.txt", 1)
    X = np.array(X1 + X2)
    Y = np.array(Y1 + Y2)
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        random_state=1,
                                                        train_size=0.8)
    clf = svm.SVC(C=30, kernel='rbf', gamma=2.5, decision_function_shape='ovr')
    rf = clf.fit(x_train, y_train.ravel())

    joblib.dump(rf, 'rf.model')  # Save the trained model to disk

    print("Training accuracy:", clf.score(x_train, y_train))
    print("Test accuracy:", clf.score(x_test, y_test))
Example #20
 def setUpClass(cls):
     # Classification use-case
     cls.X_c, cls.y_c = make_moons(1000, noise=0.5)
     cls.X_c = pd.DataFrame(cls.X_c, columns=['F1', 'F2'])
     cls.target_names = ['class 0', 'class 1']
     cls.X_train_c, cls.X_test_c, cls.y_train_c, cls.y_test_c = train_test_split(
         cls.X_c, cls.y_c)
     cls.classifier_est = DecisionTreeClassifier(max_depth=5,
                                                 random_state=5)
     cls.classifier_est.fit(cls.X_train_c, cls.y_train_c)
     cls.interpreter = Interpretation(cls.X_train_c,
                                      feature_names=cls.X_c.columns)
     cls.model_inst = InMemoryModel(cls.classifier_est.predict,
                                    examples=cls.X_train_c,
                                    model_type='classifier',
                                    unique_values=[0, 1],
                                    feature_names=cls.X_c.columns,
                                    target_names=cls.target_names,
                                    log_level=_INFO)
Example #21
    def test_multivariate(self):

        def ignore_scalar_warning():
            warnings.filterwarnings(
                "ignore", category=UserWarning,
                message="All the covariates are scalar.")

        X, y = make_regression(n_samples=20, n_features=10,
                               random_state=1, bias=3.5)

        X_train, X_test, y_train, _ = train_test_split(
            X, y, random_state=2)

        for regularization_parameter in [0, 1, 10, 100]:

            with self.subTest(
                    regularization_parameter=regularization_parameter):

                sklearn_l2 = Ridge(alpha=regularization_parameter)
                skfda_l2 = LinearRegression(
                    regularization=L2Regularization(
                        regularization_parameter=regularization_parameter),
                )

                sklearn_l2.fit(X_train, y_train)
                with warnings.catch_warnings():
                    ignore_scalar_warning()
                    skfda_l2.fit(X_train, y_train)

                sklearn_y_pred = sklearn_l2.predict(X_test)
                with warnings.catch_warnings():
                    ignore_scalar_warning()
                    skfda_y_pred = skfda_l2.predict(X_test)

                np.testing.assert_allclose(
                    sklearn_l2.coef_, skfda_l2.coef_[0])

                np.testing.assert_allclose(
                    sklearn_l2.intercept_, skfda_l2.intercept_)

                np.testing.assert_allclose(
                    sklearn_y_pred, skfda_y_pred)
def driver():
    dataset = build()
    delaylist = [
        'ArrDelay', 'DepDelay', 'CarrierDelay', 'WeatherDelay', 'NASDelay',
        'SecurityDelay', 'LateAircraftDelay'
    ]
    #plotStats(dataset, plotlist1, 'SFO')
    #print(dataset.columns.tolist())

    dataset = dataset.reset_index()
    dataset = dataset.fillna(0)  # fillna returns a new DataFrame; assign the result back
    #Converting categorical features to numerics
    dataset["Dest"] = dataset["Dest"].astype('category')
    dataset["Dest"] = dataset["Dest"].cat.codes

    #dataset = dataset.sample(n=20000)

    dataset['Date'] = dataset['Date'].apply(lambda x: x.timestamp())
    dataSFO = dataset.loc[dataset['Origin'].isin(['SFO'])]
    dataOAK = dataset.loc[dataset['Origin'].isin(['OAK'])]
    dataSFO = dataSFO.iloc[0:10000]
    dataOAK = dataOAK.iloc[0:10000]
    frames = [dataSFO, dataOAK]
    NNdata = pd.concat(frames)
    #NNdata = NNdata.sample(n=20000)
    labels = NNdata["Origin"]
    NNdata.drop('Origin', axis=1, inplace=True)

    delayset = dataset[delaylist]

    c1 = dataset.DayOfWeek.unique()

    #labels = dataset["Origin"]
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    labels = np_utils.to_categorical(labels, 2)
    data = NNdata
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        train_size=0.8)

    FeedForward(x_train, x_test, y_train, y_test, len(NNdata.dtypes))
Example #23
def process_gsc__data_con():
    concatenated_data = extract_gsc_data_con()
    concatenated_data = pd.DataFrame(concatenated_data)
    BigSigma = concatenated_data.cov()
    concatenated_data = concatenated_data.loc[:, (BigSigma != 0).any(axis=0)]
    BigSigma = pd.DataFrame(concatenated_data).cov()
    BigSigma = np.diag(np.diag(BigSigma))
    BigSigma_inv = np.linalg.inv(BigSigma)
    concatenated_data = concatenated_data.values
    print(concatenated_data.shape, BigSigma_inv.shape)
    train, test_and_val, train_out, test_and_val_out = train_test_split(
        concatenated_data, target, test_size=0.3, shuffle=True)
    train = np.array(train)
    pivot = int(len(test_and_val) / 2)
    test = test_and_val[:pivot]
    val = test_and_val[pivot:]

    test_out = test_and_val_out[:pivot]
    val_out = test_and_val_out[pivot:]
    # # print(len(concatenated_data))

    return train, test, val, train_out, test_out, val_out, BigSigma_inv
Example #24
def load_data(extend_disgust):
    '''
    Extract data from the 'fer2013.csv' file.

    extend_disgust: whether to augment the disgust class

    return: numpy arrays
        train_X:       shape (?, 48, 48)
        validation_X:  shape (?, 48, 48)
        train_y:       shape (?,)
        validation_y:  shape (?,)
    '''
    
    data = pd.read_csv("../../dataset/fer2013/fer2013.csv")
    
    X = []
    y = []
    for (pixels, emotion) in zip(data['pixels'], data['emotion']):
        #if emotion == 0 or emotion == 1 or emotion == 2:
        #   continue
        img = np.array(pixels.split(' '), dtype=np.uint8)
        img = img.reshape((48, 48))
        #img = cv2.equalizeHist(img)
        y.append(emotion)
        X.append(img)
    
    if extend_disgust:
        # Augment the disgust class, which has far fewer samples than the other classes
        disgust_image = np.load('../../dataset/fer2013/extend_disgust.npy')
        X.extend(disgust_image)
        y.extend(np.ones((len(disgust_image),)))
    
    X = np.array(X, dtype=np.uint8)
    y = np.array(y, dtype=np.uint8)
    X = X.astype('float32')
    train_X, validation_X, train_y, validation_y = \
    train_test_split(X, y, test_size=0.2, random_state = 0)
    
    return train_X, validation_X, train_y, validation_y
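
A minimal usage sketch, assuming the dataset paths above exist; a typical follow-up is to scale the pixels and add a channel axis before feeding a CNN:

train_X, validation_X, train_y, validation_y = load_data(extend_disgust=False)
print(train_X.shape, validation_X.shape)

# Scale to [0, 1] and add a channel dimension: (N, 48, 48) -> (N, 48, 48, 1)
train_X = (train_X / 255.0)[..., np.newaxis]
validation_X = (validation_X / 255.0)[..., np.newaxis]
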
def runTheLinearSVC():
    docs_train, docs_test, y_train, y_test = train_test_split(
        data['tweet'], data['mvmt'], test_size=0.25, random_state=None)

    # TASK: Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
        ('clf', LinearSVC(C=1000)),
    ])

    # TASK: Build a grid search to find out whether unigrams or bigrams are
    # more useful.
    # Fit the pipeline on the training set using grid search for the parameters
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
    }
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
    grid_search.fit(docs_train, y_train)

    # TASK: print the mean and std for each candidate along with the parameter
    # settings for all the candidates explored by grid search.
    n_candidates = len(grid_search.cv_results_['params'])
    for i in range(n_candidates):
        print(
            i, 'params - %s; mean - %0.2f; std - %0.2f' %
            (grid_search.cv_results_['params'][i],
             grid_search.cv_results_['mean_test_score'][i],
             grid_search.cv_results_['std_test_score'][i]))
    # TASK: Predict the outcome on the testing set and store it in a variable
    # named y_predicted
    y_predicted = grid_search.predict(docs_test)
    # Print the classification report
    print(metrics.classification_report(y_test, y_predicted))
    # Print and plot the confusion matrix
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)
    plt.matshow(cm)
    plt.show()
Example #26
    def fit2(self, X, y, feature_names):
        X_train, X_eval, y_train, y_eval = train_test_split(
            X,
            y,
            stratify=y,
            test_size=0.20,
            random_state=np.random.randint(50, 1000))
        print(self.params)
        print('LightGBM: training ... ')
        eval_result = {}
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)
        self.gbm = lgb.train(
            self.params,
            lgb_train,
            num_boost_round=self.params['n_estimators'],
            valid_sets=[lgb_train, lgb_eval],
            verbose_eval=self.params['verbose_eval'],
            evals_result=eval_result,
            early_stopping_rounds=self.params['early_stopping_rounds'],
            feature_name=feature_names)

        return self.gbm, eval_result
import matplotlib.pyplot as plt, numpy as np
from sklearn.datasets import make_moons
from sklearn.tree import DecisionTreeClassifier
from InterpretableDecisionTreeClassifier import IDecisionTreeClassifier
from treeutils import tree_to_code
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

X, y = make_moons(300, noise=0.4)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

clf1 = DecisionTreeClassifier(max_depth=4).fit(Xtrain, ytrain)
clf2 = IDecisionTreeClassifier(max_depth=4).fit(Xtrain, ytrain)

print("=== original decision tree ===")
features = ["ft" + str(i) for i in range(X.shape[1])]
print(tree_to_code(clf1, features))  # output large tree
print("=== simplified (interpretable) decision tree ===")
print(tree_to_code(clf2, features))

h = 0.02
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

plt.subplot(1, 2, 1)
plt.title("original decision tree. F1: " +
          str(f1_score(ytest, clf1.predict(Xtest))))
Z = clf1.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=.8)
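
The listing stops after the first panel; a sketch of the matching second panel for the simplified tree, assuming IDecisionTreeClassifier exposes predict_proba like its sklearn counterpart:

plt.subplot(1, 2, 2)
plt.title("simplified decision tree. F1: " +
          str(f1_score(ytest, clf2.predict(Xtest))))
Z = clf2.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=.8)
plt.show()
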
Example #28
def train_and_test(alpha,
                   predictors,
                   predictor_params,
                   x_filename,
                   y_filename,
                   n_users,
                   percTest,
                   featureset_to_use,
                   diff_weighting,
                   phi,
                   force_balanced_classes,
                   do_scaling,
                   optimise_predictors,
                   report,
                   conf_report=None):
    # all_X = numpy.loadtxt(x_filename, delimiter=",")
    all_X = numpy.load(x_filename + ".npy")
    all_y = numpy.loadtxt(y_filename, delimiter=",")

    print("loaded X and y files", x_filename, y_filename)

    if numpy.isnan(all_X).any():
        print("nan in", x_filename)
        exit()

    if numpy.isnan(all_y).any():
        print("nan in", y_filename)
        exit()

    #print("selecting balanced subsample")
    print("t t split")
    X_train, X_test, y_train, y_test = train_test_split(all_X,
                                                        all_y,
                                                        test_size=percTest,
                                                        random_state=666)

    # feature extraction
    # test = SelectKBest(score_func=chi2, k=100)
    # kb = test.fit(X_train, y_train)
    # # summarize scores
    # numpy.set_printoptions(precision=3)
    # print(kb.scores_)
    # features = kb.transform(X_train)
    # mask = kb.get_support()
    # # summarize selected features
    # print(features.shape)
    # X_train = X_train[:,mask]
    # X_test = X_test[:,mask]

    scaler = StandardScaler()
    rdim = FeatureAgglomeration(n_clusters=100)
    if do_scaling:
        # input(X_train.shape)
        X_train = rdim.fit_transform(X_train)
        X_test = rdim.transform(X_test)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        with open('../../../isaac_data_files/qutor_scaler.pkl',
                  'wb') as output:
            pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL)
        with open('../../../isaac_data_files/qutor_rdim.pkl', 'wb') as output:
            pickle.dump(rdim, output, pickle.HIGHEST_PROTOCOL)

    # print("feature reduction...")
    # pc = PCA(n_components=100)
    # X_train = pc.fit_transform(X_train)
    # X_test = pc.transform(X_test)

    classes = numpy.unique(y_train)
    sample_weights = None
    if (force_balanced_classes):
        X_train, y_train = balanced_subsample(X_train, y_train, 1.0)  #0.118)

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    print("tuning classifier ...")
    for ix, p in enumerate(predictors):
        print(type(p))
        print(p.get_params().keys())

        if optimise_predictors == True and len(predictor_params[ix]) > 1:
            pbest = run_random_search(p, X_train, y_train,
                                      predictor_params[ix])
        else:
            pbest = p.fit(X_train, y_train)
        predictors[ix] = pbest

    print("pickling classifier ...")
    for ix, p in enumerate(predictors):
        p_name = predictor_params[ix]['name']
        with open(
                '../../../isaac_data_files/p_{}_{}_{}.pkl'.format(
                    p_name, alpha, phi), 'wb') as output:
            pickle.dump(p, output, pickle.HIGHEST_PROTOCOL)
    print("done!")

    # report.write("* ** *** |\| \` | |  |) /; `|` / |_| *** ** *\n")
    # report.write("* ** *** | | /_ |^|  |) ||  |  \ | | *** ** *\n")
    #report.write("RUNS,P,FB,WGT,ALPHA,PHI,SCL,0p,0r,0F,0supp,1p,1r,1F,1supp,avg_p,avg_r,avg_F,#samples\n")
    for ix, p in enumerate(predictors):

        report.write(",".join(
            map(str, (all_X.shape[0], str(p).replace(",", ";").replace(
                "\n", ""), force_balanced_classes, diff_weighting, alpha, phi,
                      do_scaling))))

        y_pred_tr = p.predict(X_train)
        y_pred = p.predict(X_test)

        # for x,y,yp in zip(X_train, y_test, y_pred):

        if conf_report:
            conf_report.write(
                str(p).replace(",", ";").replace("\n", "") + "\n")
            conf_report.write(str(alpha) + "," + str(phi) + "\n")
            conf_report.write(str(confusion_matrix(y_test, y_pred)) + "\n")
            conf_report.write("\n")
        # p = precision_score(y_test, y_pred, average=None, labels=classes)
        # r = recall_score(y_test, y_pred, average=None, labels=classes)
        # F = f1_score(y_test, y_pred, average=None, labels=classes)
        p, r, F, s = precision_recall_fscore_support(y_test,
                                                     y_pred,
                                                     labels=classes,
                                                     average=None,
                                                     warn_for=('precision',
                                                               'recall',
                                                               'f-score'))
        avp, avr, avF, _ = precision_recall_fscore_support(
            y_test,
            y_pred,
            labels=classes,
            average='weighted',
            warn_for=('precision', 'recall', 'f-score'))
        for ix, c in enumerate(classes):
            report.write(",{},{},{},{},{},".format(c, p[ix], r[ix], F[ix],
                                                   s[ix]))
        report.write("{},{},{},{}\n".format(avp, avr, avF, numpy.sum(s)))

        # report.write(classification_report(y_test, y_pred)+"\n")
        # report.write("------END OF CLASSIFIER------\n")
        report.flush()
    return X_train, X_test, y_pred_tr, y_pred, y_test, scaler
Example #29
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from nltk.chunk.util import accuracy

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)

data = pd.read_csv('../testdata/weather.csv')  # RainTomorrow is the dependent variable (yes/no must be converted to numbers); the remaining columns are independent
# print(data)  # (366, 12)
data2 = pd.DataFrame()
data2 = data.drop(['Date', 'RainToday'], axis=1)  # columns to exclude
data2.RainTomorrow = data2.RainTomorrow.map({'Yes': 1, 'No': 0})
print(data2)

# Split into train / test datasets (to guard against overfitting)
train, test = train_test_split(data2, test_size=0.3,
                               random_state=52)  # random_state works like a random seed
print(data2.shape, train.shape, test.shape)  # (366, 10) (256, 10) (110, 10)

# Classification model
col_select = " + ".join(train.columns.difference(['RainTomorrow']))
my_formula = 'RainTomorrow ~ ' + col_select
# model = smf.glm(formula=my_formula, data=train, family=sm.families.Binomial()).fit()  # alternative: a GLM with a binomial family
model = smf.logit(formula=my_formula, data=train).fit()

print(model.summary())  # Variables with P>|z| above 0.05 are poor candidates as independent variables.
# print(model.params.values)
print('Predicted:', np.rint(model.predict(test)[:10].values))
print('Actual   :', test.RainTomorrow[:10].values)

# Classification accuracy
from sklearn.metrics import accuracy_score
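
The listing ends right after the accuracy_score import; a minimal sketch of the accuracy check it presumably leads into (rows with missing values may need to be dropped first):

pred = np.rint(model.predict(test))
print('Classification accuracy:', accuracy_score(test['RainTomorrow'], pred))
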
Example #30
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)

iris = datasets.load_iris()
print(iris.keys())

# ===========================================================
x = iris.data[:, [2, 3]]  # columns 2 and 3 of every row (petal.length, petal.width): classify the 3 species from these 2 features
y = iris.target
print(x[:3])
print(y[:3], '   ', set(y))

# Split into train / test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape) #(105, 2) (45, 2) (105,) (45,)

# Scaling (not always required, but often useful): standardize so the features have mean 0 and variance 1
print(x_train[:3])  # [[3.5 1. ]  [5.5 1.8]  [5.7 2.5]]
print(x_test[:3])   # [[5.1 2.4]  [4.  1. ]  [1.4 0.2]]

sc = StandardScaler()
sc.fit(x_train)  # fit the scaler on the training data only, then apply it to both sets
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

print(x_train[:3])
print(x_test[:3])
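
A minimal continuation sketch: fitting a classifier on the scaled features, which is the usual next step in this kind of tutorial snippet:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(C=1.0, random_state=0)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print('Test accuracy:', accuracy_score(y_test, y_pred))
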