def train_test_split_mock_pandas():
    # X mock dataframe
    X_df = MockDataFrame(X)
    X_train, X_test = train_test_split(X_df)
    assert_true(isinstance(X_train, MockDataFrame))
    assert_true(isinstance(X_test, MockDataFrame))
    X_train_arr, X_test_arr = train_test_split(X_df)
def main(_):

    if FLAGS.dataset == 'cifar10':
        (X_train, y_train), (_, _) = cifar10.load_data()
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
    else:
        with open('data/train.p', mode='rb') as f:
            train = pickle.load(f)
        X_train, X_val, y_train, y_val = train_test_split(train['features'], train['labels'], test_size=0.33, random_state=0)

    train_output_file = "{}_{}_{}.p".format(FLAGS.network, FLAGS.dataset, 'bottleneck_features_train')
    validation_output_file = "{}_{}_{}.p".format(FLAGS.network, FLAGS.dataset, 'bottleneck_features_validation')

    print("Resizing to", (w, h, ch))
    print("Saving to ...")
    print(train_output_file)
    print(validation_output_file)

    with tf.Session() as sess:
        K.set_session(sess)
        K.set_learning_phase(1)

        model = create_model()

        print('Bottleneck training')
        train_gen = gen(sess, X_train, y_train, batch_size)
        bottleneck_features_train = model.predict_generator(train_gen(), X_train.shape[0])
        data = {'features': bottleneck_features_train, 'labels': y_train}
        pickle.dump(data, open(train_output_file, 'wb'))

        print('Bottleneck validation')
        val_gen = gen(sess, X_val, y_val, batch_size)
        bottleneck_features_validation = model.predict_generator(val_gen(), X_val.shape[0])
        data = {'features': bottleneck_features_validation, 'labels': y_val}
        pickle.dump(data, open(validation_output_file, 'wb'))
Example #3
def test_base_estimator():
    # Check base_estimator and its default values.
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=rng)

    ensemble = BaggingClassifier(None, n_jobs=3, random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier))

    ensemble = BaggingClassifier(DecisionTreeClassifier(), n_jobs=3, random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier))

    ensemble = BaggingClassifier(Perceptron(), n_jobs=3, random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, Perceptron))

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=rng)

    ensemble = BaggingRegressor(None, n_jobs=3, random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor))

    ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor))

    ensemble = BaggingRegressor(SVR(), n_jobs=3, random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, SVR))
Example #4
    def test_classification_with_validation(self):
        tol_places = 4
        data_x, data_y = make_classification(n_samples=100, n_features=7,
                                             n_redundant=0, n_informative=7,
                                             n_clusters_per_class=2,
                                             random_state=3227)
        label_y = np.where(data_y == 0, 'A', 'B')

        train_x, test_x, train_y, test_y = train_test_split(data_x, label_y,
                                                            test_size=0.25,
                                                            random_state=3227)

        train_x, validate_x, train_y, validate_y = train_test_split(
            train_x, train_y, test_size=0.5, random_state=3227)

        params = {
            'ref_functions': ('linear_cov',),
            'criterion_type': 'bias_retrain',
            'criterion_minimum_width': 5,
            'max_layer_count': 5,
            'verbose': 0,
            'n_jobs': 'max'
        }
        model = Classifier(**params)
        model.fit(train_x, train_y, validation_data=(validate_x, validate_y))
        pred_y = model.predict_proba(test_x)
        roc_auc = roc_auc_score(model.le.transform(test_y), pred_y)
        self.assertAlmostEqual(roc_auc, 0.76, places=tol_places)

        no1 = model.predict_neuron_output(test_x, 0, 0)
        no2 = model.predict_neuron_output(test_x, 1, 0)
Example #5
def read(d):
    data = pd.read_table(path+uni+"_"+d+".txt",delimiter='\t')
    data['label'] = 0
    for i in range(len(data.index)):
        if data.iloc[i,3]<1000:
            data.iloc[i,len(data.columns)-1]=1
        else:
            data.iloc[i,len(data.columns)-1]=0
    X_0 = data.iloc[:,7:len(data.columns)-1]
    y_0 = data.iloc[:,len(data.columns)-1]    
    X_0,X_,y_0,y_ = train_test_split(X_0,y_0,test_size=0.0,random_state=3421)
    X_1,X_test,y_1,y_test = train_test_split(X_0,y_0,test_size=0.2,random_state=1257)
    X_2,X_3,y_2,y_3 = train_test_split(X_1,y_1,test_size=1-label_rate,random_state=11)

##############  Overall prediction and cross-validation  ###########
#    scores_all = cross_val_score(RandomForestClassifier(n_estimators=500), X_1, y_1, cv=5, scoring='accuracy')
#    score_all_mean =scores_all.mean()
#    print(d+' 5-fold CV accuracy: '+str(score_all_mean))
#    rf_all = RandomForestClassifier(n_estimators=500).fit(X_1,y_1)
#    answer_rf_all = rf_all.predict(X_test)
#    accuracy_all = metrics.accuracy_score(y_test,answer_rf_all)
#    print(d+' overall accuracy: '+str(accuracy_all))
################################################
    
    return data,X_2,y_2,X_3,y_3,X_test,y_test
Example #6
def reduce_dataset(uid):
    ds = load_validation_dataframe(uid)
    X_train, X_valid, X_test, y_train, y_valid, y_test = ds

    X=pd.concat((X_train,X_valid,X_test))
    y=np.concatenate((y_train,y_valid,y_test))

    if len(y) > 5000:
        neg_inds = [i for i, v in enumerate(y) if v==0]
        pos_inds = [i for i, v in enumerate(y) if v==1]

        n_neg = 5000 - len(pos_inds)
        neg_inds = sample(neg_inds, n_neg)
        inds = sorted(neg_inds + pos_inds)
        X = X.iloc[inds,:]
        y = y[inds]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.66666, random_state=42)

    Xtrain_fname = join(DATAFRAMES_FOLDER, "dfXtrain_%d_small.pickle" % uid)
    Xvalid_fname = join(DATAFRAMES_FOLDER, "dfXvalid_%d_small.pickle" % uid)
    Xtest_fname = join(DATAFRAMES_FOLDER, "dfXtestv_%d_small.pickle" % uid)
    ys_fname = join(DATAFRAMES_FOLDER, "ysv_%d_small.pickle" % uid)

    X_train.to_pickle(Xtrain_fname)
    X_valid.to_pickle(Xvalid_fname)
    X_test.to_pickle(Xtest_fname)
    pickle.dump((y_train, y_valid, y_test), open(ys_fname, 'wb'))

    return X_train, X_valid, X_test, y_train, y_valid, y_test
Example #7
def stacking():
    X_train,X_test,Y_train,Y_test =train_test_split(x,y,
                                                        random_state=35,
                                                        test_size=0.2)
    x1_test =np.zeros((X_test.shape[0],len(classifiers)))  # holds each first-layer model's predictions on the test set
    x1_train =np.zeros((X_train.shape[0],len(classifiers)))
    print 'x1.shape',np.shape(x1_train)
    print 'y....',np.shape(Y_train)
    accuracy = np.zeros(len(classifiers))  # accuracy of each first-layer model
    for train_index, test_index in sss.split(X_train, Y_train):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf_num = 0
        for clf in classifiers:
            clf_name = clf.__class__.__name__
            clf.fit(x_train, y_train)
            x1_train[test_index,clf_num]=clf.predict(x_test)  # the next layer's training input is each model's out-of-fold prediction on its held-out fold
            x1_test[:, clf_num] += clf.predict(X_test)  # predict the test set on every fold and average afterwards
            accuracy[clf_num] += (y_test == x1_train[test_index,clf_num]).mean()  # this model's accuracy, averaged over the folds
            clf_num += 1


    print np.shape(x1_train)
    print np.shape(y_train)
    x2_train,x2_test,y2_train,y2_test =train_test_split(x1_train,Y_train,test_size=0.1)
    lr =LogisticRegression()
    lr.fit(x2_train,y2_train)
    print lr.predict(x1_test)
    print Y_test
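# The stacking example above builds its level-1 features by hand; below is a
# minimal sketch of the same out-of-fold idea using sklearn's cross_val_predict.
# The synthetic data and estimator choices here are assumptions, not taken from
# the snippet above.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=35)

level0 = [RandomForestClassifier(random_state=0), SVC(random_state=0)]
# Out-of-fold predictions on the training set become the level-1 training features.
x1_train = np.column_stack(
    [cross_val_predict(clf, X_train, y_train, cv=5) for clf in level0])
# Each level-0 model is refit on the full training set to produce the test features.
x1_test = np.column_stack(
    [clf.fit(X_train, y_train).predict(X_test) for clf in level0])
meta = LogisticRegression().fit(x1_train, y_train)
print(meta.score(x1_test, y_test))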
Example #8
	def learning( self):

		X = self.X
		y = self.y
		print( "Shape of X and y are", X.shape, y.shape)

		X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
			test_size=0.2, random_state=42)
		X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train,
														  test_size=0.2, random_state=42)

		val_monitor = skflow.monitors.ValidationMonitor(X_val, y_val,
														early_stopping_rounds=200)
		model = skflow.TensorFlowDNNRegressor(hidden_units=[100, 50, 10], steps=5000)
		model.fit(X_train, y_train, val_monitor)

		yP = model.predict(X_test)
		score_r2 = metrics.r2_score(y_test, yP)
		score_MedAE = metrics.median_absolute_error(y_test, yP)
		print('Accuracy')
		print('--------')
		print('R2: {0:f}, MedAE: {1:f}'.format(score_r2, score_MedAE))

		if self.graph:
			kutil.regress_show4( y_test, yP)
Example #9
    def __init__(self, root, train=True, val=False, color_space='lab', transform=None, test_size=0.9, val_size=0.125, location='cpu'):
        """
            color_space: 'rgb' or 'lab'
        """
        self.root_dir = root
        all_files = []
        for r, _, files in walk(self.root_dir):
            for f in files:
                if f.endswith('.jpg'):
                    all_files.append(join(r, f))
        train_val_files, test_files = train_test_split(
            all_files, test_size=test_size, random_state=69)
        train_files, val_files = train_test_split(train_val_files,
                                                  test_size=val_size, random_state=69)
        if (train and val):
            self.filenames = val_files
        elif train:
            self.filenames = train_files
        else:
            self.filenames = test_files

        self.color_space = color_space
        if (self.color_space not in ['rgb', 'lab']):
            raise(NotImplementedError)
        self.transform = transform
        self.location = location
        self.nnenc = NNEncode(location=self.location)
        self.train = train
Example #10
def lda_tuner(ingroup_otu, best_models):

    best_score = -1*np.inf
    dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    topic_series = [3]
    X = ingroup_otu.values
    eval_counter = 0

    for topics in topic_series: 
        for dtp in dtp_series:
            for twp in twp_series:
                eval_counter +=1
                X_train, X_test = train_test_split(X, test_size=0.5)
                lda = LatentDirichletAllocation(n_topics=topics, 
                                                doc_topic_prior=dtp, 
                                                topic_word_prior=twp, 
                                                learning_method='batch',
                                                random_state=42,
                                                max_iter=20)
                lda.fit(X_train)
                this_score = lda.score(X_test)
                this_perplexity = lda.perplexity(X_test)
                if this_score > best_score:
                    best_score = this_score
                    print "New Max Likelihood: {}".format(best_score)

                print "#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}".format(eval_counter, 
                                                                 topics, dtp, twp,
                                                                 this_score, this_perplexity)

                best_models.append({'n': topics, 'dtp': dtp, 'twp': twp,
                                    'score': this_score, 'perp': this_perplexity})
                if (dtp == dtp_series[-1]) and (twp == twp_series[-1]):
                    eval_counter +=1
                    X_train, X_test = train_test_split(X, test_size=0.5)
                    lda = LatentDirichletAllocation(n_topics=topics, 
                                                    doc_topic_prior=1./topics, 
                                                    topic_word_prior=1./topics, 
                                                    learning_method='batch',
                                                    random_state=42,
                                                    max_iter=20)
                    lda.fit(X_train)
                    this_score = lda.score(X_test)
                    this_perplexity = lda.perplexity(X_test)
                    if this_score > best_score:
                        best_score = this_score
                        print "New Max Likelihood: {}".format(best_score)

                    print "#{}: n:{}, dtp:{}, twp:{}, score:{} perp: {}".format(eval_counter, 
                                                                                topics, 
                                                                                (1./topics), 
                                                                                (1./topics),
                                                                                this_score,
                                                                                this_perplexity)

                    best_models.append({'n': topics, 'dtp': (1./topics), 
                                        'twp': (1./topics), 'score': this_score,
                                        'perp': this_perplexity})
    return best_models
def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, get_scorer('roc_auc'), clf, X_test, y_test)
Example #12
def split_data(data):
    X_train, X_test, Y_train, Y_test = train_test_split(data.loc[:, data.columns != label], data[label],
                                                        train_size=train_size + validation_size, test_size=test_size,
                                                        shuffle=False, random_state=0)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train,
                                                      train_size=train_size / (train_size + validation_size),
                                                      test_size=validation_size / (train_size + validation_size),
                                                      shuffle=False, random_state=0)
    return X_train, X_val, X_test, Y_train, Y_val, Y_test
def get_train_valid_test_split(n, train=0.7, valid=0.1, test=0.2, shuffle=False):
    other_split = valid+test
    if abs(train + other_split - 1) > 1e-9:  # tolerate floating-point rounding
        raise ValueError("Train, Valid, Test splits should sum to 1")
    train_set, other_set = train_test_split(range(1,n+1), 
                                            train_size=train, test_size=other_split, shuffle=shuffle)
    valid_set, test_set = train_test_split(other_set, 
                                           train_size=valid/other_split, 
                                           test_size=test/other_split,
                                           shuffle=False)
    print("train:{} valid:{} test:{}".format(len(train_set), len(valid_set), len(test_set)))
    return train_set, valid_set, test_set
def preprocess(data, test_size, sample=None, scale=True):

    data_frame_all = pandas.read_table(data)
    df = data_frame_all

    # for simplicity, and since the rows with nulls are only 11093 (<3% of our data), we just drop them
    no_null_df = df.dropna(axis=0, how='any')

    # this shows us that we no longer have null values
    no_null_df.isnull().values.any()

    # rename the cleaned data frame back to df; we're left with 238907 rows
    df = no_null_df
    df_unprocessed = df

    if sample:
        df = df.sample(frac=sample)
        print("sampled")

    df = df[['order_estimated_driving_time_min','order_estimated_shopping_time_min']]
    df['total_time_min'] = df.sum(axis=1)
    df['time_in_hours'] = df.total_time_min.divide(60)


    target = df.time_in_hours * 15
    df = df.drop(['time_in_hours', 'total_time_min'], axis=1)


    s1 = target.std()
    s2 = 7.5 #our chosen std deviation

    m1 = target.mean()
    m2 = 15 #our chosen mean

    target = m2 + (target - m1) * s2/s1  # scale our output to a mean of 15 and std deviation of 7.5



    X = df
    y = target

    if scale:
        df_pp = preprocessing.scale(df)
        print("scaled")

        X_train, X_test, y_train, y_test = train_test_split(df_pp, target, test_size=test_size, random_state=42)

    else:
        df_pp = None
        X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=test_size, random_state=42)


    return df_unprocessed, df, df_pp, target, X, X_train, X_test, y, y_train, y_test
  def test_split(self):
    ds = self.create_dataset()
    indexes = list(range(len(ds)))
    train, test = train_test_split(indexes)
    train, valid = train_test_split(train)

    splitter = SpecifiedIndexSplitter(train, valid, test)
    train_ds, valid_ds, test_ds = splitter.train_valid_test_split(ds)

    self.assertTrue(np.all(train_ds.X == ds.X[train]))
    self.assertTrue(np.all(valid_ds.X == ds.X[valid]))
    self.assertTrue(np.all(test_ds.X == ds.X[test]))
def resample(X, y, sample_fraction=0.1, test_size=0.3):
    X_columns = X.columns
    y_columns = y.columns
    n = len(X_columns)

    print('~' * 80)
    print('@@-\n', y.converted.value_counts())
    print('@@0 - Original')
    show_balance(y.values)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    print('@@2 - y_train')
    show_balance(y_train)
    print('@@2 -  y_test')
    show_balance(y_test)
    assert X_train.shape[1] == n and X_test.shape[1] == n

    ros = RandomOverSampler(random_state=42)
    X_train, y_train = ros.fit_sample(X_train, y_train)
    X_test, y_test = ros.fit_sample(X_test, y_test)
    print('@@3 - Oversampled y_train')
    show_balance(y_train)
    print('@@3 - Oversampled y_test')
    show_balance(y_test)
    assert X_train.shape[1] == n and X_test.shape[1] == n

    if sample_fraction < 1.0:
        _, X_train, _, y_train = train_test_split(X_train, y_train, test_size=sample_fraction, random_state=43)
        _, X_test, _, y_test = train_test_split(X_test, y_test, test_size=sample_fraction, random_state=44)
        print('@@2 - Downsampled y_train')
        show_balance(y_train)
        print('@@2 - Downsampled y_test')
        show_balance(y_test)
        assert len(X_train.shape) == 2 and len(X_test.shape) == 2, (X_train.shape, X_test.shape)
        assert X_train.shape[1] == n and X_test.shape[1] == n, (X_train.shape, X_test.shape)

    print('X_columns=%d %s' % (len(X_columns), X_columns))
    print('y_columns=%d %s' % (len(y_columns), y_columns))
    print('X_train=%-10s y_train=%s' % (list(X_train.shape), list(y_train.shape)))
    print('X_test =%-10s y_test =%s' % (list(X_test.shape), list(y_test.shape)))
    assert X_train.shape[1] == n and X_test.shape[1] == n

    X_train = pd.DataFrame(X_train, columns=X_columns)
    y_train = pd.DataFrame(y_train, columns=y_columns, index=X_train.index)
    X_test = pd.DataFrame(X_test, columns=X_columns)
    y_test = pd.DataFrame(y_test, columns=y_columns, index=X_test.index)
    print('@@+ y_train\n', y_train.converted.value_counts(), flush=True)
    print('@@+ y_test\n', y_test.converted.value_counts(), flush=True)

    return (X_train, y_train), (X_test, y_test)
def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    with pytest.raises(ValueError, match="multiclass format is not supported"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # test error is raised with a single class present in model
    # (predict_proba shape is not suitable for binary auc)
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, np.zeros_like(y_train))
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # for proba scorers
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('neg_log_loss')(clf, X_test, y_test)
def filter_split_data(X_raw, y_raw, metadatas, max_cloud_cover=1, timespan_before=np.inf, test_fraction=0.3, val_fraction=0.3, random_seed=0, normalized=True, balanced_classes=True, filter_center_cloudy=False):
    X, y, metadata_filtered = filter_data(X_raw, y_raw, metadatas, max_cloud_cover=max_cloud_cover, timespan_before=timespan_before, random_seed=random_seed, normalized=normalized, balanced_classes=balanced_classes, filter_center_cloudy=filter_center_cloudy)

    X, y, metadata_filtered=shuffle(X, y, metadata_filtered, random_state=random_seed)

    X_train, X_test, y_train, y_test, metadata_train, metadata_test=train_test_split(
        X, y, metadata_filtered, test_size=test_fraction, random_state=random_seed)

    X_train, X_val, y_train, y_val, metadata_train, metadata_val=train_test_split(
        X_train, y_train, metadata_train, test_size=val_fraction, random_state=random_seed)
#     print(X_train.shape,y_train.shape, len(metadata_train))
#     print(X_test.shape,y_test.shape, len(metadata_test))
#     print(X_val.shape,y_val.shape, len(metadata_val))

    return X_train, y_train, metadata_train, X_val, y_val, metadata_val, X_test, y_test, metadata_test
def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            assert_array_equal(sparse_results, dense_results)
            assert_array_equal(sparse_results, dense_results)
def test_feature_importance_regression():
    """Test that Gini importance is calculated correctly.

    This test follows the example from [1]_ (pg. 373).

    .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements
       of statistical learning. New York: Springer series in statistics.
    """
    california = fetch_california_housing()
    X, y = california.data, california.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    reg = GradientBoostingRegressor(loss='huber', learning_rate=0.1,
                                    max_leaf_nodes=6, n_estimators=100,
                                    random_state=0)
    reg.fit(X_train, y_train)
    sorted_idx = np.argsort(reg.feature_importances_)[::-1]
    sorted_features = [california.feature_names[s] for s in sorted_idx]

    # The most important feature is the median income by far.
    assert sorted_features[0] == 'MedInc'

    # The three subsequent features are the following. Their relative ordering
    # might change a bit depending on the randomness of the trees and the
    # train / test split.
    assert set(sorted_features[1:4]) == {'Longitude', 'AveOccup', 'Latitude'}
def test_gradient_boosting_early_stopping():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=1000,
                                     n_iter_no_change=10,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)

    gbr = GradientBoostingRegressor(n_estimators=1000, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)
    # Check if early_stopping works as expected
    for est, tol, early_stop_n_estimators in ((gbc, 1e-1, 24), (gbr, 1e-1, 13),
                                              (gbc, 1e-3, 36),
                                              (gbr, 1e-3, 28)):
        est.set_params(tol=tol)
        est.fit(X_train, y_train)
        assert_equal(est.n_estimators_, early_stop_n_estimators)
        assert est.score(X_test, y_test) > 0.7

    # Without early stopping
    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                     max_depth=3, random_state=42)
    gbc.fit(X, y)
    gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
                                    max_depth=3, random_state=42)
    gbr.fit(X, y)

    assert gbc.n_estimators_ == 100
    assert gbr.n_estimators_ == 200
Example #22
    def fit(self, x, y, trainer_args=None):
        """Trains the model on the dataset given.

        Args:
            x: A numpy.ndarray instance containing the training data or the training data combined with the
               validation data.
            y: A numpy.ndarray instance containing the label of the training data. or the label of the training data
               combined with the validation label.
            trainer_args: A dictionary containing the parameters of the ModelTrainer constructor.
        """
        validate_xy(x, y)
        self.y_encoder.fit(y)
        y = self.y_encoder.transform(y)
        # Divide training data into training and testing data.
        validation_set_size = int(len(y) * Constant.VALIDATION_SET_SIZE)
        validation_set_size = min(validation_set_size, 500)
        validation_set_size = max(validation_set_size, 1)
        x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=validation_set_size,
                                                            random_state=42)

        #initialize data_transformer
        self.data_transformer = self.data_transformer_class(x_train)
        # Wrap the data into DataLoaders
        train_loader = self.data_transformer.transform_train(x_train, y_train)
        test_loader = self.data_transformer.transform_test(x_test, y_test)

        self.generator = self._init_generator(self.y_encoder.n_classes, x_train.shape[1:])
        graph = self.generator.generate()

        if trainer_args is None:
            trainer_args = {'max_no_improvement_num': 30}
        _, _1, self.graph = train(None, graph, train_loader, test_loader,
                                  trainer_args, self.metric, self.loss,
                                  self.verbose, self.path)
def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=100,
                                     n_iter_no_change=10,
                                     validation_fraction=0.1,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(n_estimators=100, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    validation_fraction=0.1,
                                    random_state=42)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check if n_estimators_ increase monotonically with n_iter_no_change
    # Set validation
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_
Example #24
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
    """Generate a regression dataset with the given parameters."""
    if verbose:
        print("generating dataset...")

    X, y, coef = make_regression(n_samples=n_train + n_test,
                                 n_features=n_features, noise=noise, coef=True)

    random_seed = 13
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=n_train, random_state=random_seed)
    X_train, y_train = shuffle(X_train, y_train, random_state=random_seed)

    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    y_scaler = StandardScaler()
    y_train = y_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = y_scaler.transform(y_test[:, None])[:, 0]

    gc.collect()
    if verbose:
        print("ok")
    return X_train, y_train, X_test, y_test
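# Quick usage sketch of generate_dataset defined above; the sizes here are
# arbitrary, not taken from the original code.
X_train, y_train, X_test, y_test = generate_dataset(
    n_train=500, n_test=100, n_features=20, verbose=True)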
Example #25
def test_decision_function_shape():
    # check that decision_function_shape='ovr' gives
    # correct shape and is consistent with predict

    clf = svm.SVC(kernel='linear', C=0.1,
                  decision_function_shape='ovr').fit(iris.data, iris.target)
    dec = clf.decision_function(iris.data)
    assert_equal(dec.shape, (len(iris.data), 3))
    assert_array_equal(clf.predict(iris.data), np.argmax(dec, axis=1))

    # with five classes:
    X, y = make_blobs(n_samples=80, centers=5, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf = svm.SVC(kernel='linear', C=0.1,
                  decision_function_shape='ovr').fit(X_train, y_train)
    dec = clf.decision_function(X_test)
    assert_equal(dec.shape, (len(X_test), 5))
    assert_array_equal(clf.predict(X_test), np.argmax(dec, axis=1))

    # check the shape with decision_function_shape='ovo'
    clf = svm.SVC(kernel='linear', C=0.1,
                  decision_function_shape='ovo').fit(X_train, y_train)
    dec = clf.decision_function(X_train)
    assert_equal(dec.shape, (len(X_train), 10))

    # check deprecation warning
    clf = svm.SVC(kernel='linear', C=0.1).fit(X_train, y_train)
    msg = "change the shape of the decision function"
    dec = assert_warns_message(ChangedBehaviorWarning, msg,
                               clf.decision_function, X_train)
    assert_equal(dec.shape, (len(X_train), 10))
Example #26
    def test_multi_predict(self):
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)
Example #27
File: get_data.py  Project: Suluo/Kaggle
def pro_progess(filepath="../data"):
    height = 299
    train_files = os.listdir(filepath + '/train')
    train = np.zeros((len(train_files), height, height, 3), dtype=np.uint8)
    labels = ['dog' if f[:3] == 'dog' else 'cat' for f in train_files]  # one label per training image

    test_files = os.listdir(filepath + '/test')
    test = np.zeros((len(test_files), height, height, 3), dtype=np.uint8)

    for i in tqdm(range(len(train_files))):
        filename = filepath + '/train/' + train_files[i]
        img = cv2.imread(filename)
        img = cv2.resize(img, (height, height))
        train[i] = img[:, :, ::-1]

    for i in tqdm(range(len(test_files))):
        filename = filepath + '/test/' + test_files[i]
        img = cv2.imread(filename)
        img = cv2.resize(img, (height, height))
        test[i] = img[:, :, ::-1]

    print('Training Data Size = %.2f GB' % (sys.getsizeof(train)/1024**3))
    print('Testing Data Size = %.2f GB' % (sys.getsizeof(test)/1024**3))
    X_train, X_val, y_train, y_val = train_test_split(
        train, labels, shuffle=True, test_size=0.2, random_state=42)
    return X_train, X_val, y_train, y_val
Example #28
def test_count_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # label junk food as -1, the others as +1
    target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)

    # split the dataset for model development and final evaluation
    train_data, test_data, target_train, target_test = train_test_split(
        data, target, test_size=.2, random_state=0)

    pipeline = Pipeline([('vect', CountVectorizer()),
                         ('svc', LinearSVC())])

    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'svc__loss': ('hinge', 'squared_hinge')
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # Check that the best model found by grid search is 100% correct on the
    # held out evaluation set.
    pred = grid_search.fit(train_data, target_train).predict(test_data)
    assert_array_equal(pred, target_test)

    # on this toy dataset every parameter combination converges to a 100%
    # accuracy model, so the grid search keeps the first candidate tried
    # (unigrams) as the best estimator
    assert_equal(grid_search.best_score_, 1.0)
    best_vectorizer = grid_search.best_estimator_.named_steps['vect']
    assert_equal(best_vectorizer.ngram_range, (1, 1))
def main(unused_argv):
  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = model_selection.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

  classifier = tf.estimator.Estimator(model_fn=my_model)

  # Train.
  train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
      x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
  classifier.train(input_fn=train_input_fn, steps=1000)

  # Predict.
  test_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
      x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
  predictions = classifier.predict(input_fn=test_input_fn)
  y_predicted = np.array(list(p['class'] for p in predictions))
  y_predicted = y_predicted.reshape(np.array(y_test).shape)

  # Score with sklearn.
  score = metrics.accuracy_score(y_test, y_predicted)
  print('Accuracy (sklearn): {0:f}'.format(score))

  # Score with tensorflow.
  scores = classifier.evaluate(input_fn=test_input_fn)
  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
Example #30
    def learn(self, X, y):
        """
        Learn the best model for the data.

        Parameters
        ----------
        X : nd-array
            Data array (n_samples, n_features)
        y : nd-array
            Targets.

        Returns
        -------
        None. The best fitted model is stored in ``self.model_``.
        """

        # split into train/validation (default 75/25)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size_, random_state=self.seed_)

        # loop through list of supervised learning classification methods
        if self.model_type_ == "classification":
            models = [MLPClassifier(alpha=1, max_iter=1000),
                KNeighborsClassifier(),
                SVC(kernel="linear", C=0.025),
                SVC(kernel="poly", C=1),
                SVC(kernel="rbf", gamma=2, C=1),
                SVC(kernel="sigmoid", C=1),
                GaussianProcessClassifier(RBF()),
                GaussianProcessClassifier(ConstantKernel()),
                GaussianProcessClassifier(RationalQuadratic()),
                DecisionTreeClassifier(max_depth=5),
                RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
                AdaBoostClassifier(),
                GaussianNB(),
                LinearDiscriminantAnalysis(),
                QuadraticDiscriminantAnalysis()]

            best_score = 0

        else:
            models = [MLPRegressor(alpha=1, max_iter=1000),
                KNeighborsRegressor(),
                SVR(kernel="linear", C=0.025),
                SVR(kernel="poly", C=1),
                SVR(kernel="rbf", gamma=2, C=1),
                SVR(kernel="sigmoid", C=1),
                GaussianProcessRegressor(RBF()),
                GaussianProcessRegressor(ConstantKernel()),
                GaussianProcessRegressor(RationalQuadratic()),
                DecisionTreeRegressor(max_depth=5),
                RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1),
                AdaBoostRegressor(),
                Lasso()]

            best_score = inf

        # for each model, fit on training data then predict on testing data

        for m in models:
            m.fit(X_train, y_train)
            y_pred = m.predict(X_test)

            if self.model_type_ == "classification":
                score = accuracy_score(y_test, y_pred)

                # if testing accuracy is the best so far
                if score > best_score:
                    # set it as the best score and best model
                    best_score = score
                    self.model_ = m
                score_name = "accuracy score"
            else:
                score = mean_squared_error(y_test, y_pred)

                # if testing error is the lowest so far
                if score < best_score:
                    # set it as the best score and best model
                    best_score = score
                    self.model_ = m
                score_name = "MSE"

        # grid search parameters for best model and save

        # done learning
        print("My big brain has learned everything.\n")

        if self.verbose_:
            print("Best model: %s" % self.model_)
            print("Best %s: %0.3f" % (score_name, best_score))
Example #31
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(previsores[:, 1:4])
previsores[:, 1:4] = imputer.transform(previsores[:, 1:4])

# =============================================================================
# =============== STANDARDIZATION OF VALUES ===============
# =============================================================================
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

# =============================================================================
# ============= CREATION OF THE TRAINING AND TEST SETS ================
# =============================================================================
from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.25, random_state=0)

# IMPORT THE LIBRARY
from tensorflow import keras
#from keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

# CLASSIFIER
classificador = keras.models.Sequential()

# FIRST HIDDEN LAYER
classificador.add(Dense(units=2, activation='relu', input_dim=3))

# ANOTHER HIDDEN LAYER:
classificador.add(Dense(units=2, activation='relu'))
def exp_hog_parameters(args, var, to_log=True):
    # define feature parameters
    cell_per_block  = var['cell_per_block'] # 2
    color_space     = var['color_space']    # can be RGB, HSV, LUV, HLS, YUV, YCrCb
    hist_bins       = var['hist_bins']      # 32  # number of histogram bins
    hist_feat       = var['hist_feat']      # histogram features on or off
    hog_channel     = var['hog_channel']    # 'ALL' # can be 0, 1, 2, or 'ALL'
    hog_feat        = var['hog_feat']       # HOG features on or off
    orient          = var['orient']         # 8
    overlap         = var['overlap']        # 0.5
    pix_per_cell    = var['pix_per_cell']   # 8
    scale           = var['scale']          # 1.0
    spatial_feat    = var['spatial_feat']   # True, spatial features on or off
    spatial_size    = var['spatial_size']   # (32,32)  # spatial binning dimensions
    x_start_stop    = var['x_start_stop']   # [None, None]
    y_start_stop    = var['y_start_stop']   # [400, 656]
    xy_window       = var['xy_window']      # (128, 128)

    # list_all_images
    cars, notcars = list_all_images(args)

    # choose random car/notcar indices
    flag_random = False
    if flag_random:
        car_ind = np.random.randint(0, len(cars))
        notcar_ind = np.random.randint(0, len(notcars))
    else:
        car_ind, notcar_ind = 2734, 7868

    # read in car / notcar images
    car_image = mpimg.imread(cars[car_ind])
    notcar_image = mpimg.imread(notcars[notcar_ind])

    num_img = 5
    flag_random = False
    if flag_random:
        cars_image_to_plot = [[cars[index].split('\\')[-1][:-4], cars[index]] for index in
                              [random.randint(0, len(cars)) for i in range(num_img)]]
        notcars_image_to_plot = [[notcars[index].split('\\')[-1][:-4], notcars[index]] for index in
                                 [random.randint(0, len(notcars)) for i in range(num_img)]]
    else:
        cars_image_to_plot = [[index, cars[index]] for index in
                              [random.randint(0, len(cars)) for i in range(num_img)]]
        notcars_image_to_plot = [[index, notcars[index]] for index in
                                 [random.randint(0, len(notcars)) for i in range(num_img)]]

    car_features, car_hog_image = single_img_features(car_image, color_space=color_space, spatial_size=spatial_size,
                                                      hist_bins=hist_bins, orient=orient, pix_per_cell=pix_per_cell,
                                                      cell_per_block=cell_per_block, hog_channel=hog_channel,
                                                      spatial_feat=spatial_feat, hist_feat=hist_feat, hog_feat=hog_feat,
                                                      vis=True)

    notcar_features, notcar_hog_image = single_img_features(notcar_image, color_space=color_space,
                                                            spatial_size=spatial_size,
                                                            hist_bins=hist_bins, orient=orient,
                                                            pix_per_cell=pix_per_cell,
                                                            cell_per_block=cell_per_block, hog_channel=hog_channel,
                                                            spatial_feat=spatial_feat, hist_feat=hist_feat,
                                                            hog_feat=hog_feat,
                                                            vis=True)

    t            = time.time()
    n_samples    = 1000
    random_idxs  = np.random.randint(0, len(cars), n_samples)
    test_cars    = np.array(cars)[random_idxs]
    test_noncars = np.array(notcars)[random_idxs]

    car_features = extract_features(test_cars, color_space=color_space, spatial_size=spatial_size, hist_bins=hist_bins,
                                    orient=orient, pix_per_cell=pix_per_cell, cell_per_block=cell_per_block,
                                    hog_channel=hog_channel, spatial_feat=spatial_feat, hist_feat=hist_feat, hog_feat=hog_feat)

    notcar_features = extract_features(test_noncars, color_space=color_space, spatial_size=spatial_size, hist_bins=hist_bins,
                                    orient=orient, pix_per_cell=pix_per_cell, cell_per_block=cell_per_block,
                                    hog_channel=hog_channel, spatial_feat=spatial_feat, hist_feat=hist_feat, hog_feat=hog_feat)

    t_feature_computation = round(time.time() - t, 2)
    print(t_feature_computation, 'Seconds to compute features...')
    X = np.vstack((car_features, notcar_features)).astype(np.float64)
    # fit a per_column scaler
    X_scaler = StandardScaler().fit(X)
    # apply the scaler to X
    scaled_X = X_scaler.transform(X)

    # define the labels vector
    y = np.hstack(( np.ones(len(car_features)), np.zeros(len(notcar_features)) ))

    # split up data into randomized training and test sets
    rand_state = np.random.randint(0, 100)
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.1, random_state=rand_state)

    print('Using:', orient, 'orientations,', pix_per_cell, 'pixels per cell,', cell_per_block,
          'cells per block,', hist_bins, 'histogram bins, and', spatial_size, 'spatial sampling')
    print('Feature vector length:', len(X_train[0]))

    # use a linear SVC
    svc = LinearSVC()
    # check the training time for the SVC
    t   = time.time()

    svc.fit(X_train, y_train) # https://stackoverflow.com/questions/40524790/valueerror-this-solver-needs-samples-of-at-least-2-classes-in-the-data-but-the
    t_train = round(time.time()-t, 2)
    print(t_train, 'Seconds to train SVC...')
    # check the score of the SVC
    accuracy = round(svc.score(X_test, y_test), 4)
    print('Test Accuracy of SVC = ', accuracy)

    log = [ cell_per_block, color_space, hist_bins,
            hist_feat, hog_channel, orient,
            pix_per_cell, spatial_feat, spatial_size,
            accuracy, len(X_train[0]), t_feature_computation, t_train, t_feature_computation+t_train  ]

    # log = [ var['cell_per_block'], var['color_space'], var['hist_bins'],
    #         var['hist_feat'], var['hog_channel'], var['orient'],
    #         var['pix_per_cell'], var['spatial_feat'], var['spatial_size'],
    #         accuracy, len(X_train[0]), t_feature_computation, t_train, t_feature_computation+t_train  ]

    if to_log: log_write(args, log)
# In[ ]:

#Let's review X before we proceed

print(X.describe())
print("\n")
print(X.head())

# # Building the model using Decision Tree Regressor

# In[ ]:

from sklearn.model_selection import train_test_split
#Splitting the data into train and validation

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# In[ ]:

from sklearn.tree import DecisionTreeRegressor

#setting random_state=1 for reproducibility
titanic_model = DecisionTreeRegressor(random_state=1)

#fitting the model now
titanic_model.fit(train_X, train_y)

# In[ ]:

#We're just testing how well fitted the model is here.
titanic_preds = titanic_model.predict(val_X)
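# A small follow-up sketch (not in the original notebook): score the held-out
# predictions with mean absolute error.
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(val_y, titanic_preds))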
Example #34
def main():
    PATH = "../iris.csv"
    columns = [
        "sepal-length", "sepal-width", "petal-length", "petal-width", "class"
    ]
    df = read_csv(PATH, names=columns)
    df_dimension = df.shape
    df_head = df.head(5)
    df_summary = df.describe()
    df_class_distribution = df.groupby("class").size()

    # Univariate plots
    # df.plot(kind="box", subplots=True, layout=(2, 2), sharex=False, sharey=False)
    # pyplot.show()
    # df.hist()
    # pyplot.show()
    # End univariate plots

    # Multivariate Plots
    # scatter_matrix(df)
    # pyplot.show()

    array = df.values
    X = array[:, 0:4]
    Y = array[:, 4]

    # Spot-Check Algorithm
    # models = [("LR", LogisticRegression(solver="lbfgs", max_iter=1000)), ("LDA", LinearDiscriminantAnalysis()), ("KNN", KNeighborsClassifier()), ("CART", DecisionTreeClassifier()), ("NB", GaussianNB()), ("SVM", SVC())]

    # model_results = []
    # model_names = []

    # i = 0

    # while i < len(models):
    # 	el = models[i]
    # 	kfold = KFold(n_splits=10, random_state=7, shuffle=True)
    # 	cv_results = cross_val_score(el[1], X_train, Y_train, cv=kfold, scoring="accuracy")
    # 	print(f"model: {el[0]}, results: mean={cv_results.mean() * 100:.3f} std={cv_results.std()*100:.3f}")
    # 	model_results.append(cv_results)
    # 	model_names.append(el[0])
    # 	i += 1
    # 	if i == len(models):
    # 		# fig = pyplot.figure()
    # 		# fig.suptitle("Algorithm Comparison")
    # 		# ax = fig.add_subplot(111)
    # 		# pyplot.boxplot(model_results)
    # 		# ax.set_xticklabels(model_names)
    # 		# pyplot.show()
    # 		break
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.20,
                                                        random_state=7)
    m_knn = KNeighborsClassifier()
    m_knn.fit(X_train, Y_train)
    predictions = m_knn.predict(X_test)
    print(f"Accuracy {accuracy_score(Y_test, predictions) * 100:.2f}%")
    print(f"Confusion Matrix {confusion_matrix(Y_test, predictions)}")
    print(
        f"Classification report {classification_report(Y_test, predictions)}")
Example #35
# ### Split Data

train_test_size = 0.8
train_size = 0.625
data_aug = False
batch_size = 16
rand_state = None  #1337

df = data[keep]
# df = df[df.index > '2016']   # only keep data after 2015
# labels = labels.loc[df.index]

train_test_idx, hold_idx, y_train_test, y_hold = train_test_split(
    np.arange(len(df)),
    labels.values.ravel(),
    train_size=train_test_size,
    shuffle=True,
    stratify=labels.values.ravel(),
    random_state=rand_state)

X_train_test = df.iloc[train_test_idx].values
X_hold = df.iloc[hold_idx].values

train_idx, test_idx, y_train, y_test = train_test_split(
    train_test_idx,
    y_train_test,
    train_size=train_size,
    shuffle=True,
    stratify=y_train_test,
    random_state=rand_state)
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-18.04580688663451
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.55, tol=0.001)),
        FunctionTransformer(copy)
    ),
    LinearSVR(C=5.0, dual=True, epsilon=1.0, loss="epsilon_insensitive", tol=0.1)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Convolution1D, MaxPooling1D, Activation, Flatten, Dropout

one_hot_list=['2', '3', '4', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
       'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'T', 'U', 'V', 'W', 'X',
       'Y', 'Z']


FILE_PATH="model2.h5"#模型进行存储和读取的地方
IMAGE_SIZE=30
# imgs,labels,counter=read_file(PATH,IMAGE_SIZE)
dataset=pd.read_hdf('./data/train.h5',key='train')
dataset_x=dataset.iloc[:,0:900].values
dataset_x=dataset_x.reshape(dataset_x.shape[0],IMAGE_SIZE,IMAGE_SIZE)/255.0
dataset_y=pd.get_dummies(dataset.iloc[:,900]).values
x_train,x_test,y_train,y_test=train_test_split(dataset_x,dataset_y,test_size=0.2,random_state=0)
model=Sequential()
model.add(Convolution1D(filters=32,kernel_size=3,padding='same',input_shape=x_train.shape[1:]))
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2,strides=2,padding='same'))
model.add(Convolution1D(filters=64,kernel_size=3,padding='same'))
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2,strides=2,padding='same'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(dataset_y.shape[1]))
model.add(Activation('softmax'))
model.summary()
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
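# A plausible training call for the model compiled above; the epoch count and
# batch size are assumptions, not taken from the original snippet.
model.fit(x_train, y_train, epochs=10, batch_size=64,
          validation_data=(x_test, y_test))
model.save(FILE_PATH)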
Example #38
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

nyc2 = pd.read_csv("ave_yearly_temp_nyc_1895-2017.csv")
# One column is a series

X_train, X_test, y_train, y_test = train_test_split(
    nyc2.Date.values.reshape(-1, 1), nyc2.Value.values, random_state=11
)

linear_regression = LinearRegression()
linear_regression.fit(X=X_train, y=y_train)
print(linear_regression.coef_)
print(linear_regression.intercept_)

predicted = linear_regression.predict(X_test)
expected = y_test

for p, e in zip(
    predicted[::5], expected[::5]
):  # the [::5] slice takes every 5th element
    print(f"Predicted: {p:.2f}, expected: {e:.2f}")

# lambda implements y = mx + b
predict = lambda x: linear_regression.coef_ * x + linear_regression.intercept_

axes = sns.scatterplot(
dataset = pd.read_csv(
    r'C:\Sid Data\BITS\4th Sem\Udemey\Part 3 - Classification\Section 14 - Logistic Regression\Social_Network_Ads.csv'
)
# Need to get the age & salary columns as X
x_data = dataset.iloc[:, [2, 3]].values
print(x_data)
#x_data1 = dataset.iloc[:, :-1].values

row, columns = dataset.shape
column_index = columns - 1
y_data = dataset.iloc[:, column_index].values
print(y_data)

# Split Train & test set
from sklearn.model_selection import train_test_split
x_data_train, x_data_test, y_data_train, y_data_test = train_test_split(
    x_data, y_data, test_size=0.25, random_state=0)

#scale the data for running algorithms
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_data_train = sc_x.fit_transform(pd.DataFrame(x_data_train))
x_data_test = sc_x.transform(pd.DataFrame(x_data_test))  # reuse the training-set statistics; do not refit on the test set
print(x_data)
print(x_data_train)

#Logistic Regression-------------------------------------
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=0)
log_reg.fit(x_data_train, y_data_train)
#Predict test results
y_data_pred = log_reg.predict(x_data_test)
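# A minimal evaluation sketch (the metric choice is an assumption): compare the
# predictions against the held-out labels.
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_data_test, y_data_pred))
print(accuracy_score(y_data_test, y_data_pred))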
Example #40
0
preprocess_text(technology, sentences, 'technology')
preprocess_text(car, sentences, 'car')
preprocess_text(entertainment, sentences, 'entertainment')
preprocess_text(military, sentences, 'military')
preprocess_text(sports, sentences, 'sports')
""" generate data set """
# Shuffle the order to produce a more reliable training set
random.shuffle(sentences)

for sentence in sentences:
    print(sentence[0], sentence[1])

# Split the original data set into training and test sets with sklearn's train_test_split (zip(*...) first separates content and tags)
content, tag = zip(*sentences)
content_train, content_test, tag_train, tag_test = train_test_split(
    content, tag, random_state=1234)
print(len(content_train))

# Extract bag-of-words features from the cleaned text
vectorizer = CountVectorizer(
    analyzer='word',      # tokenize at the word level
    ngram_range=(1, 4),   # use n-grams of sizes 1 through 4
    max_features=20000)   # keep the 20,000 most common n-grams
vectorizer.fit(content_train)


def get_features(content):
    return vectorizer.transform(content)


# import classifier and train data
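# A minimal sketch of the step the comment above refers to; the classifier choice
# (multinomial naive Bayes) is an assumption, not taken from the original snippet.
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vectorizer.transform(content_train), tag_train)
print(classifier.score(vectorizer.transform(content_test), tag_test))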
Example #41
0
    plt.legend(loc="best")

    return plt


# digits = load_digits()
# X, y = digits.data, digits.target

array = data.values
X = array[:,0:4]
y = array[:,4]
validation_size = .2
seed = 13
scoring = 'accuracy'

X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X,y,test_size=validation_size, random_state=seed)

cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)


title = "Learning Curves (Naive Bayes)"
estimator = GaussianNB()
plot_learning_curve(estimator, title, X_train, Y_train, ylim=(0.7, 1.01), cv=cv, n_jobs=4)


title = "Learning curve (K Nearest Neighbours)"
estimator = KNeighborsClassifier()
plot_learning_curve(estimator, title, X_train, Y_train, ylim=(0.7, 1.01), cv=cv, n_jobs=4)


plt.show()
data.isnull().values.any()
data[data.isnull().any(axis=1)] 
data['a'] = pd.DataFrame({'a': range(30001)})
sampled_df = data[(data['a'] % 10) == 0]
sampled_df.shape
sampled_df_remaining = data[(data['a'] % 10) != 0]
sampled_df_remaining.shape
LoanY = sampled_df['default payment next month'].copy()
loan_features = ['LIMIT_BAL','SEX','EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
LoanX = sampled_df[loan_features].copy()
classified_names = ['default payment next month']

# train_test_split returns a 4-tuple: (X_train, X_test, y_train, y_test)
trainTest = train_test_split(LoanX, LoanY, test_size=0.3, train_size=0.7, random_state=0)



from sklearn.model_selection import GridSearchCV, learning_curve
from datetime import datetime
from sklearn.metrics import confusion_matrix

cross_validations = 10
train_sizes_base = [100, 200, 400, 600,800,1000,1200,1400]

def plot_learning_curve(title, cv_curve):
#     _, _, test_scores_base = base_curve
    train_sizes, train_scores, test_scores = cv_curve
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
num_features = X.shape[1]

standardise = StandardScaler()
X = standardise.fit_transform(X)

# Labels need to be 0-based (0-4, not 1-5) for to_categorical
y -= 1
y = tf.keras.utils.to_categorical(y).astype(np.float64)

# Tuning H
accuracies = defaultdict(list)
for H in range(0, 10, 2):  # check H in {0, 2, 4, 6, 8}
    for i in range(10):  # 10 repeated random 90/10 splits (Monte Carlo cross-validation)
        print(f'H: {H}, Fold {i+1}')
        train_x, test_x, train_y, test_y = train_test_split(X,
                                                            y,
                                                            test_size=0.1)

        model = BfgsMlp(n_input=num_features, n_hidden=H, n_output=num_classes)
        model.fit(train_x, train_y, max_iterations=100)
        test_acc = model.accuracy(test_x, test_y)

        accuracies[H].append(test_acc)

accuracies = {H: sum(accs) / len(accs) for H, accs in accuracies.items()}
for H, acc in accuracies.items():
    print(f'Average accuracy for H = {H}: {acc*100:.2f}%')

# Once the best H is found, build a model with that H and monitor training to check for over-fitting
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.1)
best_H = max(accuracies, key=accuracies.get)
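# A minimal sketch of the final step described above (the iteration budget is
# reused from the tuning loop): retrain with the best H and report test accuracy
# to check for over-fitting.
final_model = BfgsMlp(n_input=num_features, n_hidden=best_H, n_output=num_classes)
final_model.fit(train_x, train_y, max_iterations=100)
print(f'Test accuracy with H = {best_H}: {final_model.accuracy(test_x, test_y)*100:.2f}%')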
for i in range(0, temp_row):
	for j in range(2, temp_col-1):
		x_temp.append(df.iloc[i, j])
	x.append( x_temp )
	y.append( df.iloc[i, -1] )
	x_temp = []

print ('x=')
print (x)
print ('y=')
print (y)

MinMax = MinMaxScaler()
x_new = MinMax.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x_new, y, test_size=0.25)
print ('y_train =')
print (y_train)
print ('y_test =')
print (y_test)



cv = StratifiedKFold(n_splits= 5, shuffle= True)
C = np.arange(0.5, 5, 0.5)
param_grid = dict(C = C)
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 7)

model = SVR()
grid_search = GridSearchCV(model, param_grid, scoring = 'neg_mean_squared_error', n_jobs = -1, cv = kfold)
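# A minimal sketch of running the search; KFold is swapped in here as an
# assumption, because StratifiedKFold expects class labels while SVR targets
# are continuous.
from sklearn.model_selection import KFold
grid_search_cv = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error',
                              n_jobs=-1, cv=KFold(n_splits=5, shuffle=True, random_state=7))
grid_search_cv.fit(x_train, y_train)
print(grid_search_cv.best_params_, grid_search_cv.best_score_)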
y_train = np.zeros(len(y_train))
Example #45
0
        if sigmoid(X[i], theta) > 0.5: y[i] = 1
    return y


def accuracy(X_test, y_test):
    y_pred = predict(X_test, theta)
    return accuracy_score(y_test, y_pred)


# X_train, X_test, y_train, y_test
m, n = np.shape(X)
X_ex = np.c_[X, np.ones(m)]  # extend the matrix to [X, 1] (append a bias column of ones)
#print (X_ex)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_ex, y, test_size=0.5, random_state=0)
# obtain the optimal parameters via gradient descent
theta = gradDscent_1(X_train, y_train)
# make predictions by applying theta
y_pred = predict(X_test, theta)
m_test = np.shape(X_test)[0]
# compute the confusion matrix and the prediction accuracy
cfmat = np.zeros((2, 2))
for i in range(m_test):
    if y_pred[i] == y_test[i] == 0:
        cfmat[0, 0] += 1
    elif y_pred[i] == y_test[i] == 1:
        cfmat[1, 1] += 1
    elif y_pred[i] == 0:
        cfmat[1, 0] += 1
    elif y_pred[i] == 1:
        cfmat[0, 1] += 1
Example #46
0
y = dataset.iloc[:, 3].values

# Taking care of missing data
#from sklearn.preprocessing import Imputer
#imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
#imputer = imputer.fit(X[:, 1:3])
#X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical data
# Encoding the Independent Variable
#from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#labelencoder_X = LabelEncoder()
#X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
#onehotencoder = OneHotEncoder(categorical_features = [0])
#X = onehotencoder.fit_transform(X).toarray()
## Encoding the Dependent Variable
#labelencoder_y = LabelEncoder()
#y = labelencoder_y.fit_transform(y)

#splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

#Feature Scaling
#from sklearn.preprocessing import StandardScaler
#sc_X = StandardScaler()
#X_train = sc_X.fit_transform(X_train)
#X_test = sc_X.transform(X_test)
Example #47
0
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import time
import csv
from sklearn.neural_network import MLPRegressor

start = time.perf_counter()
train = pd.read_csv('train.csv')
X = train.drop(columns=['winPlacePerc'])
Y = train['winPlacePerc']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
clf = MLPRegressor(hidden_layer_sizes=(
    300,
    200,
    100,
    50,
),
                   activation='relu',
                   solver='adam',
                   alpha=0.0001,
                   batch_size=100,
                   learning_rate='constant',
                   learning_rate_init=0.001,
                   max_iter=200,
                   shuffle=True,
                   verbose=True,
                   early_stopping=True,
Example #48
0
def main():
    print 'Using Keras version: ', keras.__version__

    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument('-t', '--train_model', dest='train_model', help='Option to train model or simply make diagnostic plots (0=False, 1=True)', default=0, type=int)
    parser.add_argument('-w', '--classweights', dest='classweights', help='Option to choose class weights', default='InverseSRYields', type=str)
    parser.add_argument('-s', '--sel', dest='selection', help='Option to choose selection', default='tH', type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    classweights_name = args.classweights
    selection = args.selection

    # Number of classes to use
    number_of_classes = 4

    # Create instance of output directory where all results are saved.
    output_directory = '2017samples_%s_%s/' % (selection,classweights_name)

    check_dir(output_directory)

    # Create plots subdirectory
    plots_dir = os.path.join(output_directory,'plots/')

    input_var_jsonFile = open('input_vars_SigRegion_wFwdJet.json','r')

    if selection == 'tH':
        selection_criteria = '(is_tH_like_and_not_ttH_like==0 || is_tH_like_and_not_ttH_like==1)'#&& n_presel_jet>=3'

    # Load Variables from .json
    variable_list = json.load(input_var_jsonFile,encoding="utf-8").items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key,var in variable_list:
        column_headers.append(key)
    column_headers.append('EventWeight')
    column_headers.append('xsec_rwgt')
    column_headers.append('nEvent')

    # Create instance of the input files directory
    inputs_file_path = '/afs/cern.ch/work/j/jthomasw/private/IHEP/ttHML/github/ttH_multilepton/keras-DNN/samples/rootplas_LegacyMVA_1113/DiLepRegion/ttH2017TrainDNN2L/'

    # Load ttree into .csv including all variables listed in column_headers
    print '<train-DNN> Input file path: ', inputs_file_path
    outputdataframe_name = '%s/output_dataframe_%s.csv' %(output_directory,selection)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print '<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name)
    else:
        print '<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path)
        data = load_data(inputs_file_path,column_headers,selection_criteria)
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)

    # Make instance of plotter tool
    Plotter = plotter()

    # Create statistically independent train/test datasets (used to train/evaluate the network)
    traindataset, valdataset = train_test_split(data, test_size=0.2)
    #valdataset.to_csv('valid_dataset.csv', index=False)

    #print '<train-DNN> Training dataset shape: ', traindataset.shape
    #print '<train-DNN> Validation dataset shape: ', valdataset.shape

    # Drop the last three columns (EventWeight, xsec_rwgt, nEvent) so only input features remain
    training_columns = column_headers[:-3]
    print '<train-DNN> Training features: ', training_columns

    # Select data from columns under the remaining column headers in traindataset
    X_train = traindataset[training_columns].values

    # Select data from 'target' as target for MVA
    Y_train = traindataset.target.astype(int)
    X_test = valdataset[training_columns].values
    Y_test = valdataset.target.astype(int)

    num_variables = len(training_columns)

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]
    train_df.drop(['EventWeight'], axis=1, inplace=True)
    train_df.drop(['xsec_rwgt'], axis=1, inplace=True)

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot.png'
    #Plotter.correlation_matrix(train_df)
    #Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name)

    #sampleweights = traindataset.loc[:,'sampleWeight']*traindataset.loc[:,'EventWeight']
    sampleweights = traindataset.loc[:,'sampleWeight']
    sampleweights = np.array(sampleweights)

    # Dictionaries of class weights to combat class imbalance
    if classweights_name == 'balanced':
        tuned_weighted = class_weight.compute_class_weight('balanced', np.unique([0,1,2,3]), Y_train)
    if classweights_name == 'tunedweights':
        tuned_weighted = {0 : 7.67, 1 : 1.0, 2 : 4.62, 3 : 7.67}

    # Per instance weights calculation so we can correctly apply event weights to diagnostic plots
    train_weights = traindataset['EventWeight'].values * traindataset['xsec_rwgt'].values
    test_weights = valdataset['EventWeight'].values * valdataset['xsec_rwgt'].values

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)
    # Transform to one hot encoded arrays
    Y_train = np_utils.to_categorical(encoded_Y)
    Y_test = np_utils.to_categorical(encoded_Y_test)

    optimizer = 'Adam'#'Nadam'
    if do_model_fit == 1:
        histories = []
        labels = []
        # Define model and early stopping
        early_stopping_monitor = EarlyStopping(patience=100,monitor='val_loss',verbose=1)
        model3 = baseline_model(num_variables,optimizer,number_of_classes)

        # Fit the model
        # Batch size = number of examples processed before each weight update (larger = faster epochs)
        # Epochs = number of complete passes over the training data
        #history3 = model3.fit(X_train,Y_train,validation_split=0.2,epochs=500,batch_size=1000,verbose=1,shuffle=True,class_weight=tuned_weighted,callbacks=[early_stopping_monitor])
        history3 = model3.fit(X_train,Y_train,validation_split=0.2,epochs=300,batch_size=1500,verbose=1,shuffle=True,sample_weight=sampleweights,callbacks=[early_stopping_monitor])
        histories.append(history3)
        labels.append(optimizer)

        # Make plot of loss function evolution
        Plotter.plot_training_progress_acc(histories, labels)
        acc_progress_filename = 'DNN_acc_wrt_epoch.png'
        Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename)
        # Which model do you want the rest of the plots for?
        model = model3
    else:
        # Which model do you want to load?
        model_name = os.path.join(output_directory,'model.h5')
        print '<train-DNN> Loaded Model: %s' % (model_name)
        model = load_trained_model(model_name,num_variables,optimizer,number_of_classes)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))

    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory,'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory,'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory,'model_serialised.json')
    with open(model_json_name,'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory,'model_schematic.png')
    plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    # Make overfitting plots of output nodes
    Plotter.overfitting(model, Y_train, Y_test, result_probs, result_probs_test, plots_dir, train_weights, test_weights)

    # Get true process values for testing dataset
    original_encoded_test_Y = []
    for i in xrange(len(result_probs_test)):
        if Y_test[i][0] == 1:
            original_encoded_test_Y.append(0)
        if Y_test[i][1] == 1:
            original_encoded_test_Y.append(1)
        if Y_test[i][2] == 1:
            original_encoded_test_Y.append(2)
        if Y_test[i][3] == 1:
            original_encoded_test_Y.append(3)

    # Get true process integers for training dataset
    original_encoded_train_Y = []
    for i in xrange(len(result_probs)):
        if Y_train[i][0] == 1:
            original_encoded_train_Y.append(0)
        if Y_train[i][1] == 1:
            original_encoded_train_Y.append(1)
        if Y_train[i][2] == 1:
            original_encoded_train_Y.append(2)
        if Y_train[i][3] == 1:
            original_encoded_train_Y.append(3)

    # Get true class values for testing dataset
    result_classes_test = newencoder.inverse_transform(result_classes_test)
    result_classes_train = newencoder.inverse_transform(result_classes)

    # Create confusion matrices for training and testing performance
    Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'index')
    Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TRAIN.png')
    Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'index')
    Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TEST.png')

    Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'')
    Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TRAIN.png')
    Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'')
    Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TEST.png')

    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs, original_encoded_test_Y, result_probs_test, 0 , 'ttHnode')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs, original_encoded_test_Y, result_probs_test, 1 , 'Other')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs, original_encoded_test_Y, result_probs_test, 2 , 'ttWnode')
    Plotter.ROC_sklearn(original_encoded_train_Y, result_probs, original_encoded_test_Y, result_probs_test, 3 , 'tHQnode')
Example #49
0
        df.loc[-1] = ["covid", ('covid/' + filename)] 
        df.index = df.index + 1 
        df = df.sort_index() 
for dirpath, dirnames, filenames in os.walk('../input/covid19-pneumonia-normal-chest-xray-pa-dataset/pneumonia'):
    for filename in filenames:
        df.loc[-1] = ["pneumonia", ('pneumonia/' + filename)] 
        df.index = df.index + 1 
        df = df.sort_index()

df_y = df['class']
df_x = df['directory']


"""## Data preprocessing"""

X_train, X_test, Y_train, Y_test = train_test_split(df_x, df_y, stratify=df_y, test_size=0.20, random_state=7)

df_x, df_y = X_train, Y_train

test = pd.concat([X_test, Y_test], axis = 1)
test.head()

# used to copy files according to each fold
def copy_images(df, directory):
    
    # input and output directory
    input_path = "../input/covid19-pneumonia-normal-chest-xray-pa-dataset"     
    output_path = "out/" + directory

    # remove all files from previous fold
    if os.path.exists(output_path):
Example #50
0
#             master_df = pd.concat([master_df, df], axis=1)

# This code is for the normalized data
for key in teste_list:
    df = pd.read_table(key, header=None)
    master_df = pd.concat([master_df, df], axis=1)

target_df = pd.read_table('profile.csv', header=None)
target_df = target_df.drop([1, 2, 3, 4], axis=1)

print(master_df)
# print(master_df.dtypes)
# print(target_df.dtypes)

x_train, x_test, y_train, y_test = train_test_split(master_df,
                                                    target_df,
                                                    test_size=0.3,
                                                    random_state=42)
knn = KNeighborsClassifier(n_neighbors=4)

knn.fit(x_train, y_train.values.ravel())
print(knn.score(x_test, y_test))
pred = knn.predict(x_test)
y_train_pred = knn.predict(x_train)
cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, pred)
print('test confusion matrix\n', cm_test, '\n')
print('train confusion matrix\n', cm_train, '\n')
print(classification_report(y_test, pred), '\n')
stop = timeit.default_timer()

print('Program executed in ', stop - start, 'seconds\n')
Example #51
0
import pandas as pd
# to split the data into train and test sets
from sklearn.model_selection import train_test_split

from sklearn import svm
from sklearn.model_selection import GridSearchCV
#from sklearn.metrics import classification_report,confusion_matrix

train = pd.read_csv('../data/fashion-mnist_train.csv')

train.head()

train.describe().transpose()

X = train.drop('label', axis=1)

y = train['label']

# split into training data and test data
X_train, X_test, y_train, y_test = train_test_split(X, y)

################################################################################
# Set the parameters by cross-validation
param_grid = [{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

clf = GridSearchCV(svm.SVC(C=1), param_grid, n_jobs=4, refit=True)

clf.fit(X_train, y_train)

print("------fin-----")
Example #52
0
with open('labeler/targets_large.csv') as handle:
    reader = csv.DictReader(handle)

    for row in reader:
        train_imgs.append('./labeler/car_ims/{}'.format(row['img'].strip()))
        # train_imgs.append(row['img'].strip())
        train_labels.append(row['color'])

print("loaded imgs and labels")
X, y = read_and_process_image(train_imgs, train_labels)

X = np.array(X)
y = np.array(y)

X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=0.3,
                                                  random_state=2)

y_train = to_categorical(y_train)
y_val = to_categorical(y_val)

print("Shape of train images is:", X_train.shape)
print("Shape of validation images is:", X_val.shape)
print("Shape of labels is:", y_train.shape)
print("Shape of labels is:", y_val.shape)

ntrain = len(X_train)
nval = len(X_val)

batch_size = 8
columns = 5
Example #53
0
    # Loop through each training image for the current person
    for person_img in pix:
        try:
            face = face_recognition.load_image_file("./train/" + person + "/" +
                                                    person_img)
            face_enc = face_recognition.face_encodings(face)[0]

            # Add face encoding for current image with corresponding label (name) to the training data
            encodings.append(face_enc)
            names.append(person)

        except:
            pass

X_train, X_test, y_train, y_test = train_test_split(encodings,
                                                    names,
                                                    test_size=0.2,
                                                    random_state=42)
print("train_test_split completed \n ")

# Create and train the SVC classifier
print("load dataset completed \n training model started")
clf = svm.SVC(gamma='scale')
clf.fit(X_train, y_train)

print("model training completed \n saving the model")

y_pred = clf.predict(X_test)
#precision = average_precision_score(y_test, y_pred)
#print("average precision score is:")
#print(precision)
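# A minimal evaluation sketch, not part of the original snippet: overall accuracy
# of the classifier on the held-out encodings.
from sklearn.metrics import accuracy_score
print("hold-out accuracy:", accuracy_score(y_test, y_pred))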
features = dataset.iloc[:, 2:].values  
labels = dataset.iloc[:, 1]

## label encoding
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
labels = labelencoder.fit_transform(labels)
print(labels)

labels= np.array( labels, dtype= np.float64)


# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
features_train = sc.fit_transform(features_train)
features_test = sc.transform(features_test)

"""
Unregularized model - Linear regressor
Regularized model - Ridge regression and lasso
"""
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso 
from sklearn.linear_model import Ridge  # RidgeClassifier also exists, for classification
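# A minimal sketch of the comparison set up above; the alpha values are assumptions,
# not taken from the original snippet. Fit each model on the scaled features and
# compare R^2 scores on the test split.
for name, reg in [('linear', LinearRegression()),
                  ('ridge', Ridge(alpha=1.0)),
                  ('lasso', Lasso(alpha=0.1))]:
    reg.fit(features_train, labels_train)
    print(name, reg.score(features_test, labels_test))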
Example #55
0
File: liver_dev.py  Project: canesqui/nn
normalized.head()
#pdb.set_trace()
normalized['Age'] = pd.DataFrame(df, columns=['Age'])
#print(normalized)
normalized['Gender'] = pd.DataFrame(df, columns=['Gender'])
#print(normalized)
normalized['Dataset'] = pd.DataFrame(df, columns=['Dataset'])
print(normalized)
#import pdb; 
#pdb.set_trace()


# train_test_split comes from scikit-learn. Either test_size or train_size can
# be specified; here the test set is 20% of the dataset.
train, test = train_test_split(normalized, test_size=0.2)

# the validation set will be 10% of the training dataset
train, val = train_test_split(train, test_size=0.1)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('Dataset')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds
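# A minimal usage sketch (the batch size is an assumption): build tf.data
# pipelines for the three splits created above.
train_ds = df_to_dataset(train, batch_size=32)
val_ds = df_to_dataset(val, shuffle=False, batch_size=32)
test_ds = df_to_dataset(test, shuffle=False, batch_size=32)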
Example #56
0
    def _load(self, small=True):

        df = pd.read_csv(os.path.join(self.datadir,'../../data/higgs/higgs-boson.csv'))


        cols_to_keep = ['DER_mass_MMC', 'DER_mass_transverse_met_lep','DER_mass_vis',
                        'DER_pt_h', 'DER_deltar_tau_lep','DER_pt_tot', 'DER_sum_pt',
                        'DER_pt_ratio_lep_tau','DER_met_phi_centrality', 'PRI_tau_pt',
                        'PRI_tau_eta', 'PRI_tau_phi', 'PRI_lep_pt', 'PRI_lep_eta',
                        'PRI_lep_phi', 'PRI_met', 'PRI_met_phi', 'PRI_met_sumet','Label']

        if small:
            fr = .15
        else:
            fr = 1
        df['Label'] = df['Label'].replace('s',1)
        df['Label'] = df['Label'].replace('b',0)

        df = df[cols_to_keep].sample(frac=fr, random_state = 100)

        train, test = train_test_split(df, test_size=.25, random_state=100)



        X_train = np.array(train.iloc[:,:-1])
        y_train = np.array(train.iloc[:,-1])
        X_test = np.array(test.iloc[:,:-1])
        y_test = np.array(test.iloc[:,-1])

        scaler = StandardScaler().fit(X_train)

        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        fmt = ['%.18f']* (X_train.shape[1]) + ['%d']

        if(self.ass2):
            df = pd.read_csv(os.path.join(self.datadir,'../../../ml_randomized_optimization/ABAGAIL/src/exp/tests/higgs_train.txt'), header=None, sep=',')
            train = np.array(df.iloc[:-1250,:])
            test = np.array(df.iloc[-1250:,:])

            train = np.array(train)
            test = np.array(test)
            if(self.to_txt):
                self._send_to_txt(train[:5000], './higgs_train.txt', fmt=fmt)
                self._send_to_txt(test, './higgs_test.txt', fmt=fmt)

            self.train_data = {
                'features': train[:,:-1],
                'labels': train[:,-1]
            }

            self.test_data = {
                'features': test[:,:-1],
                'labels': test[:,-1]
            }
        else:
            self.train_data = {
                'features': X_train,
                'labels': y_train
            }

            self.test_data = {
                'features': X_test,
                'labels': y_test
            }
Example #57
0
        "upper class": 3
    }
})

df_cust = df_cust.replace(transformation)
df_features = df_cust.loc[:, df_cust.columns.difference(["outcome"])]
df_target = df_cust["outcome"]

rf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=20)

rf.fit(df_features, df_target)
top_k_feats = rf.feature_importances_.argsort()[-10:]
X, y = df_cust.loc[:, df_features.columns[top_k_feats]], df_target

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, train_size=.8, shuffle=True, stratify=df_target)

scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y_train.values, dtype=torch.float32)

k = X_tensor.shape[1] - 1
frequentist_model = TorchLogisticRegression(k)
q = AutoDiagonalNormal(bayes_logistic)
svi = SVI(bayes_logistic,
          q,
          Adam({"lr": 1e-2}),
          loss=Trace_ELBO(),
Example #58
0
#init paths
data_path = './data/'
image_path = 'IMG/'

#get the steering angle, throttle and brake data from the csv file
with open(data_path + 'driving_log.csv', 'r', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    for line in reader:
        samples.append(line)

#shuffle samples
samples = shuffle(samples)

#split into train and validation sets - 80/20 split
train_samples, validation_samples = train_test_split(samples, test_size=0.2)

#set up variables for generators to be used later for the fit_generator function
train_generator = generator(train_samples, batch_size=32)

validation_generator = generator(validation_samples, batch_size=32)

### retrieve model ####
my_model = Path('./model.h5')

if my_model.is_file():

    model = load_model('model.h5')

else:
X
X.shape
# help(X.std)
# X.std(axis=0) # So, that's why we skip scaling.
y
y.shape

### Data types and missing values identification
X.dtype
# X.isnull.sum()

# {0:.2f}.format()
from sklearn.model_selection import train_test_split
import pandas as pd

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=14)
print("There are {} samples in the training dataset".format(X_train.shape[0]))
print("There are {} samples in the testing dataset".format(X_test.shape[0]))
print("Each sample has {} features".format(X_train.shape[1]))

print("The class distribution of training set is\n{}.".format(y.value_counts() / len(y)))

print("The class distribution of training set is\n{}.".format(y_train.value_counts() / len(y_train)))

print("The class distribution of test set is\n{}.".format(y_test.value_counts() / len(y_test)))

### Standardization
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
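# A minimal completion sketch (assumed next step): apply the same fitted scaler
# to the test split so both sets share the training-set statistics.
X_test_std = sc.transform(X_test)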
def plot_cross_val_selection():
    iris = load_iris()
    X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data,
                                                              iris.target,
                                                              random_state=0)

    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                  'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_trainval, y_trainval)
    results = pd.DataFrame(grid_search.cv_results_)[15:]

    best = np.argmax(results.mean_test_score.values)
    plt.figure(figsize=(10, 3))
    plt.xlim(-1, len(results))
    plt.ylim(0, 1.1)
    for i, (_, row) in enumerate(results.iterrows()):
        scores = row[['split%d_test_score' % s for s in range(5)]]
        marker_cv, = plt.plot([i] * 5, scores, '^', c='gray', markersize=5,
                              alpha=.5)
        marker_mean, = plt.plot(i, row.mean_test_score, 'v', c='none', alpha=1,
                                markersize=10, markeredgecolor='k')
        if i == best:
            marker_best, = plt.plot(i, row.mean_test_score, 'o', c='red',
                                    fillstyle="none", alpha=1, markersize=20,
                                    markeredgewidth=3)

    plt.xticks(range(len(results)), [str(x).strip("{}").replace("'", "") for x
                                     in grid_search.cv_results_['params']],
               rotation=90)
    plt.ylabel("Validation accuracy")
    plt.xlabel("Parameter settings")
    plt.legend([marker_cv, marker_mean, marker_best],
               ["cv accuracy", "mean accuracy", "best parameter setting"],
               loc=(1.05, .4))