def train_test_split_mock_pandas():
    """Check that splitting a ``MockDataFrame`` yields ``MockDataFrame`` parts."""
    # Wrap X in the mock dataframe container.
    frame = MockDataFrame(X)
    X_train, X_test = train_test_split(frame)
    for part in (X_train, X_test):
        assert_true(isinstance(part, MockDataFrame))
    # Second split of the same frame; the results are not inspected.
    X_train_arr, X_test_arr = train_test_split(frame)
def main(_):
    """Compute bottleneck features for the train/validation splits and pickle them.

    The dataset is chosen by ``FLAGS.dataset``: ``'cifar10'`` is loaded with
    ``cifar10.load_data()`` and split 80/20; anything else is read from
    ``data/train.p`` and split 67/33. The model returned by ``create_model()``
    produces the features, which are written together with their labels to
    ``<network>_<dataset>_bottleneck_features_{train,validation}.p``.

    Args:
        _: Unused positional argument.
    """
    if FLAGS.dataset == 'cifar10':
        (X_train, y_train), _ = cifar10.load_data()
        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=0)
    else:
        # NOTE(review): pickle.load executes arbitrary code; only use trusted files.
        with open('data/train.p', mode='rb') as f:
            train = pickle.load(f)
        X_train, X_val, y_train, y_val = train_test_split(
            train['features'], train['labels'], test_size=0.33, random_state=0)

    train_output_file = "{}_{}_{}.p".format(
        FLAGS.network, FLAGS.dataset, 'bottleneck_features_train')
    validation_output_file = "{}_{}_{}.p".format(
        FLAGS.network, FLAGS.dataset, 'bottleneck_features_validation')

    print("Resizing to", (w, h, ch))
    print("Saving to ...")
    print(train_output_file)
    print(validation_output_file)

    with tf.Session() as sess:
        K.set_session(sess)
        K.set_learning_phase(1)

        model = create_model()

        print('Bottleneck training')
        train_gen = gen(sess, X_train, y_train, batch_size)
        bottleneck_features_train = model.predict_generator(train_gen(), X_train.shape[0])
        data = {'features': bottleneck_features_train, 'labels': y_train}
        # Context manager guarantees the file is flushed and closed.
        with open(train_output_file, 'wb') as out:
            pickle.dump(data, out)

        print('Bottleneck validation')
        val_gen = gen(sess, X_val, y_val, batch_size)
        bottleneck_features_validation = model.predict_generator(val_gen(), X_val.shape[0])
        data = {'features': bottleneck_features_validation, 'labels': y_val}
        with open(validation_output_file, 'wb') as out:
            pickle.dump(data, out)
def test_base_estimator():
    """Check ``base_estimator_`` for the default and for explicit base estimators."""
    rng = check_random_state(0)

    def check(ensemble_cls, base_cls, expected_cls, X_fit, y_fit):
        # ``base_cls`` of None means "pass None and rely on the default".
        base = None if base_cls is None else base_cls()
        fitted = ensemble_cls(base, n_jobs=3, random_state=0).fit(X_fit, y_fit)
        assert_true(isinstance(fitted.base_estimator_, expected_cls))

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng)
    for base_cls, expected_cls in (
            (None, DecisionTreeClassifier),
            (DecisionTreeClassifier, DecisionTreeClassifier),
            (Perceptron, Perceptron)):
        check(BaggingClassifier, base_cls, expected_cls, X_train, y_train)

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=rng)
    for base_cls, expected_cls in (
            (None, DecisionTreeRegressor),
            (DecisionTreeRegressor, DecisionTreeRegressor),
            (SVR, SVR)):
        check(BaggingRegressor, base_cls, expected_cls, X_train, y_train)
def test_classification_with_validation(self):
    """Fit the classifier with explicit validation data and check its ROC AUC."""
    tol_places = 4
    seed = 3227
    data_x, data_y = make_classification(
        n_samples=100, n_features=7, n_redundant=0, n_informative=7,
        n_clusters_per_class=2, random_state=seed)
    # Use string class labels instead of 0/1.
    label_y = np.where(data_y == 0, 'A', 'B')

    # Hold out 25% for testing, then halve the remainder into train/validation.
    train_x, test_x, train_y, test_y = train_test_split(
        data_x, label_y, test_size=0.25, random_state=seed)
    train_x, validate_x, train_y, validate_y = train_test_split(
        train_x, train_y, test_size=0.5, random_state=seed)

    model = Classifier(
        ref_functions=('linear_cov',),
        criterion_type='bias_retrain',
        criterion_minimum_width=5,
        max_layer_count=5,
        verbose=0,
        n_jobs='max',
    )
    model.fit(train_x, train_y, validation_data=(validate_x, validate_y))

    pred_y = model.predict_proba(test_x)
    encoded_test_y = model.le.transform(test_y)
    roc_auc = roc_auc_score(encoded_test_y, pred_y)
    self.assertAlmostEqual(roc_auc, 0.76, places=tol_places)

    no1 = model.predict_neuron_output(test_x, 0, 0)
    no2 = model.predict_neuron_output(test_x, 1, 0)
def read(d):
    """Load ``<path><uni>_<d>.txt``, add a binary label and split it.

    The label is 1 where the fourth column (position 3) is below 1000 and 0
    otherwise. Features are the columns from position 7 up to, but excluding,
    the label column.

    Args:
        d: Suffix used to build the tab-separated input file name.

    Returns:
        Tuple ``(data, X_2, y_2, X_3, y_3, X_test, y_test)``: the labelled
        frame, the two parts of the training data split by ``label_rate``,
        and the 20% held-out test set.
    """
    data = pd.read_table(path + uni + "_" + d + ".txt", delimiter='\t')
    # Vectorised labelling instead of a per-row iloc loop.
    data['label'] = (data.iloc[:, 3] < 1000).astype(int)

    X_0 = data.iloc[:, 7:-1]
    y_0 = data.iloc[:, -1]
    # NOTE(review): test_size=0.0 is used here only to shuffle; recent
    # scikit-learn releases appear to reject a zero test size -- confirm
    # against the installed version.
    X_0, X_, y_0, y_ = train_test_split(X_0, y_0, test_size=0.0, random_state=3421)
    X_1, X_test, y_1, y_test = train_test_split(X_0, y_0, test_size=0.2, random_state=1257)
    X_2, X_3, y_2, y_3 = train_test_split(X_1, y_1, test_size=1 - label_rate, random_state=11)
    ############## Overall prediction and cross-validation (disabled) ###########
    # scores_all = cross_val_score(RandomForestClassifier(n_estimators=500), X_1, y_1, cv=5, scoring='accuracy')
    # score_all_mean =scores_all.mean()
    # print(d+'5折交互检验:'+str(score_all_mean))
    # rf_all = RandomForestClassifier(n_estimators=500).fit(X_1,y_1)
    # answer_rf_all = rf_all.predict(X_test)
    # accuracy_all = metrics.accuracy_score(y_test,answer_rf_all)
    # print(d+'整体预测:'+str(accuracy_all))
    ################################################
    return data, X_2, y_2, X_3, y_3, X_test, y_test
def reduce_dataset(uid):
    """Build a reduced (about 5000 rows) dataset for ``uid``, re-split and save it.

    The stored train/valid/test parts are concatenated. If there are more than
    5000 rows, all rows labelled 1 are kept and rows labelled 0 are randomly
    sampled to fill up to 5000. The result is split 70/30 and the 30% part is
    split again into validation and test, then everything is pickled under
    ``DATAFRAMES_FOLDER``.

    Args:
        uid: Integer identifier used to load the data and name the output files.

    Returns:
        Tuple ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``.
    """
    ds = load_validation_dataframe(uid)
    X_train, X_valid, X_test, y_train, y_valid, y_test = ds
    X = pd.concat((X_train, X_valid, X_test))
    y = np.concatenate((y_train, y_valid, y_test))

    if len(y) > 5000:
        neg_inds = [i for i, v in enumerate(y) if v == 0]
        pos_inds = [i for i, v in enumerate(y) if v == 1]
        # Clamp so sample() never gets a negative or oversized count
        # (e.g. when there are already 5000 or more positives).
        n_neg = min(len(neg_inds), max(0, 5000 - len(pos_inds)))
        neg_inds = sample(neg_inds, n_neg)
        inds = sorted(neg_inds + pos_inds)
        X = X.iloc[inds, :]
        y = y[inds]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_test, y_test, test_size=0.66666, random_state=42)

    Xtrain_fname = join(DATAFRAMES_FOLDER, "dfXtrain_%d_small.pickle" % uid)
    Xvalid_fname = join(DATAFRAMES_FOLDER, "dfXvalid_%d_small.pickle" % uid)
    Xtest_fname = join(DATAFRAMES_FOLDER, "dfXtestv_%d_small.pickle" % uid)
    ys_fname = join(DATAFRAMES_FOLDER, "ysv_%d_small.pickle" % uid)

    X_train.to_pickle(Xtrain_fname)
    X_valid.to_pickle(Xvalid_fname)
    X_test.to_pickle(Xtest_fname)
    # Context manager closes the labels file deterministically.
    with open(ys_fname, 'wb') as ys_file:
        pickle.dump((y_train, y_valid, y_test), ys_file)

    return X_train, X_valid, X_test, y_train, y_valid, y_test
def stacking():
    """Two-level stacking: base classifiers feed a logistic-regression meta-model.

    The module-level ``x``/``y`` are split 80/20. For each fold produced by
    ``sss.split`` on the training part, every classifier in ``classifiers`` is
    fitted on the fold's training rows; its predictions on the fold's held-out
    rows become first-level training features, and its predictions on the test
    set are averaged over the folds to become first-level test features.
    A ``LogisticRegression`` is then fitted on the first-level features and
    its test-set predictions are printed next to the true labels.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(x, y, random_state=35, test_size=0.2)
    # First-level outputs: one column per classifier.
    x1_test = np.zeros((X_test.shape[0], len(classifiers)))
    x1_train = np.zeros((X_train.shape[0], len(classifiers)))
    print('x1.shape %s' % (np.shape(x1_train),))
    print('y.... %s' % (np.shape(Y_train),))
    accuracy = np.zeros(len(classifiers))  # per-model accuracy

    n_folds = 0
    for train_index, test_index in sss.split(X_train, Y_train):
        n_folds += 1
        # The fold indices refer to X_train/Y_train, not to the full x/y.
        # Assumes positional (numpy-style) indexing -- TODO confirm.
        x_train, x_test = X_train[train_index], X_train[test_index]
        y_train, y_test = Y_train[train_index], Y_train[test_index]
        for clf_num, clf in enumerate(classifiers):
            clf.fit(x_train, y_train)
            # Out-of-fold predictions are the meta-model's training input.
            x1_train[test_index, clf_num] = clf.predict(x_test)
            # Accumulate test-set predictions; averaged after the loop.
            x1_test[:, clf_num] += clf.predict(X_test)
            accuracy[clf_num] += (y_test == x1_train[test_index, clf_num]).mean()

    if n_folds:
        # Average over folds so test features share the scale of train features.
        x1_test /= n_folds
        accuracy /= n_folds

    print(np.shape(x1_train))
    print(np.shape(y_train))
    x2_train, x2_test, y2_train, y2_test = train_test_split(x1_train, Y_train, test_size=0.1)
    lr = LogisticRegression()
    lr.fit(x2_train, y2_train)
    print(lr.predict(x1_test))
    print(Y_test)
def learning(self):
    """Train a DNN regressor on ``self.X``/``self.y`` and report test metrics.

    The data is split 80/20 into train/test, and the training part again 80/20
    into train/validation for the validation monitor. R2 and median absolute
    error on the test set are printed; if ``self.graph`` is truthy the
    predictions are also plotted.
    """
    features, targets = self.X, self.y
    print("Shape of X and y are", features.shape, targets.shape)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        features, targets, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = model_selection.train_test_split(
        X_train, y_train, test_size=0.2, random_state=42)

    monitor = skflow.monitors.ValidationMonitor(
        X_val, y_val, early_stopping_rounds=200)
    regressor = skflow.TensorFlowDNNRegressor(hidden_units=[100, 50, 10], steps=5000)
    regressor.fit(X_train, y_train, monitor)

    yP = regressor.predict(X_test)
    score_r2 = metrics.r2_score(y_test, yP)
    score_MedAE = metrics.median_absolute_error(y_test, yP)

    print('Accuracy')
    print('--------')
    print('R2: {0:f}, MedAE: {1:f}'.format(score_r2, score_MedAE))

    if self.graph:
        kutil.regress_show4(y_test, yP)
def __init__(self, root, train=True, val=False, color_space='lab', transform=None, test_size=0.9, val_size=0.125, location='cpu'):
    """
    Index the '.jpg' files under ``root`` and select one split of them.

    root: directory walked recursively for files ending in '.jpg'
    train, val: select the split -- both true -> validation files,
        only ``train`` true -> training files, otherwise -> test files
    color_space: 'rgb' or 'lab'; anything else raises NotImplementedError
    transform: stored on the instance as-is
    test_size: fraction of all files held out as the test split
    val_size: fraction of the remaining files held out as the validation split
    location: passed to ``NNEncode`` and stored on the instance
    """
    # Fail fast on an unsupported colour space, before walking the filesystem.
    if color_space not in ['rgb', 'lab']:
        raise NotImplementedError(
            "unsupported color_space %r (expected 'rgb' or 'lab')" % (color_space,))

    self.root_dir = root
    all_files = []
    for r, _, files in walk(self.root_dir):
        for f in files:
            if f.endswith('.jpg'):
                all_files.append(join(r, f))

    # Fixed seed so all three dataset variants partition the same listing identically.
    train_val_files, test_files = train_test_split(
        all_files, test_size=test_size, random_state=69)
    train_files, val_files = train_test_split(
        train_val_files, test_size=val_size, random_state=69)

    if train and val:
        self.filenames = val_files
    elif train:
        self.filenames = train_files
    else:
        self.filenames = test_files

    self.color_space = color_space
    self.transform = transform
    self.location = location
    self.nnenc = NNEncode(location=self.location)
    self.train = train
def _fit_and_score_lda(X, topics, dtp, twp):
    """Fit LDA on a random half of ``X``; return (score, perplexity) on the other half."""
    X_train, X_test = train_test_split(X, test_size=0.5)
    lda = LatentDirichletAllocation(n_topics=topics,
                                    doc_topic_prior=dtp,
                                    topic_word_prior=twp,
                                    learning_method='batch',
                                    random_state=42,
                                    max_iter=20)
    lda.fit(X_train)
    return lda.score(X_test), lda.perplexity(X_test)


def lda_tuner(ingroup_otu, best_models):
    """Grid-search LDA priors and record the score of every configuration.

    For each topic count, every (doc_topic_prior, topic_word_prior) pair from
    the hard-coded grids is evaluated, followed by one extra run with both
    priors set to ``1 / topics``. Each run uses a fresh random 50/50 split.

    Args:
        ingroup_otu: Object whose ``.values`` is the data matrix to model.
        best_models: List that receives one dict per evaluated configuration
            (keys ``n``, ``dtp``, ``twp``, ``score``, ``perp``); mutated in place.

    Returns:
        The same ``best_models`` list.
    """
    best_score = -1 * np.inf
    dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    topic_series = [3]
    X = ingroup_otu.values
    eval_counter = 0

    # The two log formats differ slightly in the original output; both are kept.
    grid_fmt = "#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}"
    symmetric_fmt = "#{}: n:{}, dtp:{}, twp:{}, score:{} perp: {}"

    for topics in topic_series:
        configs = [(dtp, twp, grid_fmt)
                   for dtp in dtp_series for twp in twp_series]
        # Symmetric 1/topics priors are evaluated once, after the full grid.
        configs.append((1. / topics, 1. / topics, symmetric_fmt))

        for dtp, twp, fmt in configs:
            eval_counter += 1
            this_score, this_perplexity = _fit_and_score_lda(X, topics, dtp, twp)
            if this_score > best_score:
                best_score = this_score
                print("New Max Likelihood: {}".format(best_score))
            print(fmt.format(eval_counter, topics, dtp, twp,
                             this_score, this_perplexity))
            best_models.append({'n': topics, 'dtp': dtp, 'twp': twp,
                                'score': this_score, 'perp': this_perplexity})
    return best_models
def test_thresholded_scorers():
    """Check scorers that need continuous scores rather than hard predictions."""
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    roc_auc_scorer = get_scorer('roc_auc')

    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = roc_auc_scorer(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    # The log-loss scorer is the negated loss.
    logscore = get_scorer('log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # Same check for an estimator without decision_function.
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = roc_auc_scorer(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # And for a regressor (no decision_function either).
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = roc_auc_scorer(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # More than two classes must raise.
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, get_scorer('roc_auc'), clf, X_test, y_test)
def split_data(data):
    """Split ``data`` in row order (no shuffling) into train/validation/test.

    The ``label`` column is the target; all other columns are features. The
    proportions come from the module-level ``train_size``, ``validation_size``
    and ``test_size``.

    Returns:
        Tuple ``(X_train, X_val, X_test, Y_train, Y_val, Y_test)``.
    """
    features = data.loc[:, data.columns != label]
    targets = data[label]

    # First cut: (train + validation) versus test.
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, targets,
        train_size=train_size + validation_size,
        test_size=test_size,
        shuffle=False,
        random_state=0)

    # Second cut: rescale the fractions relative to the remaining rows.
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_train, Y_train,
        train_size=train_size / (train_size + validation_size),
        test_size=validation_size / (train_size + validation_size),
        shuffle=False,
        random_state=0)

    return X_train, X_val, X_test, Y_train, Y_val, Y_test
def get_train_valid_test_split(n, train=0.7, valid=0.1, test=0.2, shuffle=False):
    """Partition the ids ``1..n`` into train, validation and test sets.

    Args:
        n: Number of ids; the ids are ``range(1, n + 1)``.
        train: Fraction of ids for the training set.
        valid: Fraction of ids for the validation set.
        test: Fraction of ids for the test set.
        shuffle: Whether the train/other split shuffles the ids. The
            validation/test split of the remainder is never shuffled.

    Returns:
        Tuple ``(train_set, valid_set, test_set)`` of id lists.

    Raises:
        ValueError: If the three fractions do not sum to 1.
    """
    other_split = valid + test
    # Compare with a tolerance: exact float equality rejects valid inputs
    # such as 0.1 / 0.7 / 0.2, whose sum evaluates to 0.9999999999999999.
    if abs(train + other_split - 1) > 1e-9:
        raise ValueError("Train, Valid, Test splits should sum to 1")
    train_set, other_set = train_test_split(
        range(1, n + 1), train_size=train, test_size=other_split, shuffle=shuffle)
    # Rescale the fractions relative to the remainder.
    valid_set, test_set = train_test_split(
        other_set, train_size=valid / other_split, test_size=test / other_split,
        shuffle=False)
    print("train:{} valid:{} test:{}".format(len(train_set), len(valid_set), len(test_set)))
    return train_set, valid_set, test_set
def preprocess(data, test_size, sample=None, scale=True):
    """Load the order table, derive a rescaled target and split it.

    Rows containing any null value are dropped. The target is the sum of the
    estimated driving and shopping minutes, converted to hours and multiplied
    by 15, then linearly rescaled to mean 15 and standard deviation 7.5.

    Args:
        data: Path or buffer accepted by ``pandas.read_table``.
        test_size: Passed to ``train_test_split``.
        sample: Optional fraction of rows to sample before processing.
        scale: If true, features are standardised with ``preprocessing.scale``
            before splitting.

    Returns:
        Tuple ``(df_unprocessed, df, df_pp, target, X, X_train, X_test, y,
        y_train, y_test)``. ``df_pp`` is ``None`` when ``scale`` is false;
        ``X`` is ``df`` and ``y`` is ``target``.
    """
    # Rows with nulls are simply dropped.
    df = pandas.read_table(data).dropna(axis=0, how='any')
    df_unprocessed = df

    if sample:
        df = df.sample(frac=sample)
        print("sampled")

    # Work on an explicit copy so nothing is assigned onto a slice of the
    # original frame (avoids pandas' chained-assignment warning).
    df = df[['order_estimated_driving_time_min',
             'order_estimated_shopping_time_min']].copy()

    total_time_min = df.sum(axis=1)
    # Keep the series name the target had when it was read from a column.
    time_in_hours = total_time_min.divide(60).rename('time_in_hours')
    target = time_in_hours * 15

    s1 = target.std()
    s2 = 7.5  # chosen standard deviation
    m1 = target.mean()
    m2 = 15  # chosen mean
    # Rescale the target to mean 15 and standard deviation 7.5.
    target = m2 + (target - m1) * s2 / s1

    X = df
    y = target

    if scale:
        df_pp = preprocessing.scale(df)
        print("scaled")
        X_train, X_test, y_train, y_test = train_test_split(
            df_pp, target, test_size=test_size, random_state=42)
    else:
        df_pp = None
        X_train, X_test, y_train, y_test = train_test_split(
            df, target, test_size=test_size, random_state=42)

    return df_unprocessed, df, df_pp, target, X, X_train, X_test, y, y_train, y_test
def test_split(self):
    """Check that ``SpecifiedIndexSplitter`` returns exactly the requested rows."""
    ds = self.create_dataset()
    all_indexes = list(range(len(ds)))

    # Derive three index sets: test first, then train/valid from the rest.
    train, test = train_test_split(all_indexes)
    train, valid = train_test_split(train)

    splitter = SpecifiedIndexSplitter(train, valid, test)
    train_ds, valid_ds, test_ds = splitter.train_valid_test_split(ds)

    for subset, indexes in ((train_ds, train), (valid_ds, valid), (test_ds, test)):
        self.assertTrue(np.all(subset.X == ds.X[indexes]))
def resample(X, y, sample_fraction=0.1, test_size=0.3):
    """Split, oversample and optionally downsample ``X``/``y``.

    The data is split into train/test, each part is balanced with
    ``RandomOverSampler``, and when ``sample_fraction < 1.0`` only that
    fraction of each part is kept. Class balance is printed at every stage.

    Returns:
        ``((X_train, y_train), (X_test, y_test))`` as DataFrames carrying the
        original column names.
    """
    X_columns = X.columns
    y_columns = y.columns
    n = len(X_columns)

    def report(title, labels):
        # Print a stage header followed by the class balance.
        print(title)
        show_balance(labels)

    print('~' * 80)
    print('@@-\n', y.converted.value_counts())
    report('@@0 - Original', y.values)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42)
    report('@@2 - y_train', y_train)
    report('@@2 - y_test', y_test)
    assert X_train.shape[1] == n and X_test.shape[1] == n

    # Balance the classes of each part independently.
    ros = RandomOverSampler(random_state=42)
    X_train, y_train = ros.fit_sample(X_train, y_train)
    X_test, y_test = ros.fit_sample(X_test, y_test)
    report('@@3 - Oversampled y_train', y_train)
    report('@@3 - Oversampled y_test', y_test)
    assert X_train.shape[1] == n and X_test.shape[1] == n

    if sample_fraction < 1.0:
        # Keep only the "test" side of a split, i.e. ``sample_fraction`` of the rows.
        _, X_train, _, y_train = train_test_split(
            X_train, y_train, test_size=sample_fraction, random_state=43)
        _, X_test, _, y_test = train_test_split(
            X_test, y_test, test_size=sample_fraction, random_state=44)
        report('@@2 - Downsampled y_train', y_train)
        report('@@2 - Downsampled y_test', y_test)

    shapes = (X_train.shape, X_test.shape)
    assert len(X_train.shape) == 2 and len(X_test.shape) == 2, shapes
    assert X_train.shape[1] == n and X_test.shape[1] == n, shapes

    print('X_columns=%d %s' % (len(X_columns), X_columns))
    print('y_columns=%d %s' % (len(y_columns), y_columns))
    print('X_train=%-10s y_train=%s' % (list(X_train.shape), list(y_train.shape)))
    print('X_test =%-10s y_test =%s' % (list(X_test.shape), list(y_test.shape)))
    assert X_train.shape[1] == n and X_test.shape[1] == n

    # Re-wrap as DataFrames with the original column names.
    X_train = pd.DataFrame(X_train, columns=X_columns)
    y_train = pd.DataFrame(y_train, columns=y_columns, index=X_train.index)
    X_test = pd.DataFrame(X_test, columns=X_columns)
    y_test = pd.DataFrame(y_test, columns=y_columns, index=X_test.index)

    print('@@+ y_train\n', y_train.converted.value_counts(), flush=True)
    print('@@+ y_test\n', y_test.converted.value_counts(), flush=True)

    return (X_train, y_train), (X_test, y_test)
def test_thresholded_scorers():
    """Check scorers that need continuous scores rather than hard predictions."""
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    proba_positive = clf.predict_proba(X_test)[:, 1]
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, proba_positive)
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    # The neg_log_loss scorer is the negated loss.
    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # Same check for an estimator without decision_function.
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # And for a regressor (no decision_function either).
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # More than two classes must raise.
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    with pytest.raises(ValueError, match="multiclass format is not supported"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # A model fitted on a single class must raise as well
    # (its predict_proba shape is not suitable for binary AUC).
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, np.zeros_like(y_train))
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # The same holds for probability-based scorers.
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('neg_log_loss')(clf, X_test, y_test)
def filter_split_data(X_raw, y_raw, metadatas, max_cloud_cover=1,
                      timespan_before=np.inf, test_fraction=0.3,
                      val_fraction=0.3, random_seed=0, normalized=True,
                      balanced_classes=True, filter_center_cloudy=False):
    """Filter the raw samples, shuffle them and split into train/val/test.

    Filtering is delegated to ``filter_data`` with the given options. The test
    part is ``test_fraction`` of the filtered data; the validation part is
    ``val_fraction`` of what remains. Metadata is split alongside ``X``/``y``.

    Returns:
        ``(X_train, y_train, metadata_train, X_val, y_val, metadata_val,
        X_test, y_test, metadata_test)``.
    """
    filtered = filter_data(
        X_raw, y_raw, metadatas,
        max_cloud_cover=max_cloud_cover,
        timespan_before=timespan_before,
        random_seed=random_seed,
        normalized=normalized,
        balanced_classes=balanced_classes,
        filter_center_cloudy=filter_center_cloudy)
    X, y, metadata_filtered = filtered

    X, y, metadata_filtered = shuffle(
        X, y, metadata_filtered, random_state=random_seed)

    # Hold out the test part first ...
    (X_train, X_test,
     y_train, y_test,
     metadata_train, metadata_test) = train_test_split(
        X, y, metadata_filtered,
        test_size=test_fraction, random_state=random_seed)

    # ... then carve the validation part out of the remainder.
    (X_train, X_val,
     y_train, y_val,
     metadata_train, metadata_val) = train_test_split(
        X_train, y_train, metadata_train,
        test_size=val_fraction, random_state=random_seed)

    return (X_train, y_train, metadata_train,
            X_val, y_val, metadata_val,
            X_test, y_test, metadata_test)
def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input.

    For every parameter combination and both sparse formats, a forest trained
    and evaluated on sparse data must predict the same as one trained and
    evaluated on the equivalent dense data.
    """
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            # Asserted once; the repeated identical assertion was dropped.
            assert_array_equal(sparse_results, dense_results)
def test_feature_importance_regression():
    """Test that Gini importance is calculated correctly.

    This test follows the example from [1]_ (pg. 373).

    .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements
       of statistical learning. New York: Springer series in statistics.
    """
    california = fetch_california_housing()
    X_train, X_test, y_train, y_test = train_test_split(
        california.data, california.target, random_state=0)

    reg = GradientBoostingRegressor(loss='huber', learning_rate=0.1,
                                    max_leaf_nodes=6, n_estimators=100,
                                    random_state=0)
    reg.fit(X_train, y_train)

    # Feature names ordered from most to least important.
    ranking = np.argsort(reg.feature_importances_)[::-1]
    sorted_features = [california.feature_names[idx] for idx in ranking]

    # Median income dominates by far.
    assert sorted_features[0] == 'MedInc'
    # The next three may swap places depending on tree randomness and the
    # train/test split, so only their membership is checked.
    assert set(sorted_features[1:4]) == {'Longitude', 'AveOccup', 'Latitude'}
def test_gradient_boosting_early_stopping():
    """Check the number of fitted estimators with and without early stopping."""
    X, y = make_classification(n_samples=1000, random_state=0)

    shared = dict(learning_rate=0.1, max_depth=3, random_state=42)
    gbc = GradientBoostingClassifier(n_estimators=1000, n_iter_no_change=10,
                                     **shared)
    gbr = GradientBoostingRegressor(n_estimators=1000, n_iter_no_change=10,
                                    **shared)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # With early stopping: (estimator, tol, expected number of estimators).
    cases = [
        (gbc, 1e-1, 24),
        (gbr, 1e-1, 13),
        (gbc, 1e-3, 36),
        (gbr, 1e-3, 28),
    ]
    for est, tol, early_stop_n_estimators in cases:
        est.set_params(tol=tol)
        est.fit(X_train, y_train)
        assert_equal(est.n_estimators_, early_stop_n_estimators)
        assert est.score(X_test, y_test) > 0.7

    # Without early stopping every requested estimator is fitted.
    gbc = GradientBoostingClassifier(n_estimators=100, **shared)
    gbc.fit(X, y)
    gbr = GradientBoostingRegressor(n_estimators=200, **shared)
    gbr.fit(X, y)

    assert gbc.n_estimators_ == 100
    assert gbr.n_estimators_ == 200
def fit(self, x, y, trainer_args=None):
    """Train the model on the given dataset.

    Args:
        x: A numpy.ndarray instance containing the training data, or the
            training data combined with the validation data.
        y: A numpy.ndarray instance containing the labels of the training
            data, or of the training data combined with the validation data.
        trainer_args: A dictionary containing the parameters of the
            ModelTrainer constructor.
    """
    validate_xy(x, y)
    self.y_encoder.fit(y)
    y = self.y_encoder.transform(y)

    # Hold out a validation set: a fixed fraction of the data,
    # clamped to the range [1, 500] samples.
    held_out = int(len(y) * Constant.VALIDATION_SET_SIZE)
    validation_set_size = max(min(held_out, 500), 1)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=validation_set_size, random_state=42)

    # The data transformer is initialised from the training part only.
    self.data_transformer = self.data_transformer_class(x_train)

    # Wrap both parts into DataLoaders.
    train_loader = self.data_transformer.transform_train(x_train, y_train)
    test_loader = self.data_transformer.transform_test(x_test, y_test)

    self.generator = self._init_generator(self.y_encoder.n_classes,
                                          x_train.shape[1:])
    graph = self.generator.generate()

    if trainer_args is None:
        trainer_args = {'max_no_improvement_num': 30}

    result = train(None, graph, train_loader, test_loader, trainer_args,
                   self.metric, self.loss, self.verbose, self.path)
    # Only the third element of the result (the graph) is kept.
    _, _1, self.graph = result
def test_gradient_boosting_validation_fraction():
    """Check that validation_fraction and n_iter_no_change affect early stopping."""
    X, y = make_classification(n_samples=1000, random_state=0)

    shared = dict(n_estimators=100, n_iter_no_change=10,
                  validation_fraction=0.1, learning_rate=0.1,
                  max_depth=3, random_state=42)

    gbc = GradientBoostingClassifier(**shared)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(**shared)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # A different validation fraction must change the stopping point.
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # More patience (n_iter_no_change) must yield more estimators.
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
    """Generate a standardised regression dataset with the given parameters.

    Returns:
        ``(X_train, y_train, X_test, y_test)``, with features and targets
        scaled by ``StandardScaler`` instances fitted on the training part.
    """
    if verbose:
        print("generating dataset...")

    X, y, coef = make_regression(n_samples=n_train + n_test,
                                 n_features=n_features, noise=noise,
                                 coef=True)

    random_seed = 13
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=n_train, random_state=random_seed)
    X_train, y_train = shuffle(X_train, y_train, random_state=random_seed)

    # Feature scaling: fit on train, apply to both.
    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(X_train)
    X_test = feature_scaler.transform(X_test)

    # Target scaling: the scaler wants 2-D input, so add and strip a column axis.
    target_scaler = StandardScaler()
    y_train = target_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = target_scaler.transform(y_test[:, None])[:, 0]

    gc.collect()
    if verbose:
        print("ok")
    return X_train, y_train, X_test, y_test
def test_decision_function_shape():
    """Check decision_function shapes for 'ovr', 'ovo' and the default setting."""
    # 'ovr' must give one column per class and agree with predict.
    clf = svm.SVC(kernel='linear', C=0.1,
                  decision_function_shape='ovr').fit(iris.data, iris.target)
    dec = clf.decision_function(iris.data)
    assert_equal(dec.shape, (len(iris.data), 3))
    assert_array_equal(clf.predict(iris.data), np.argmax(dec, axis=1))

    # Same check with five classes.
    X, y = make_blobs(n_samples=80, centers=5, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf = svm.SVC(kernel='linear', C=0.1,
                  decision_function_shape='ovr').fit(X_train, y_train)
    dec = clf.decision_function(X_test)
    assert_equal(dec.shape, (len(X_test), 5))
    assert_array_equal(clf.predict(X_test), np.argmax(dec, axis=1))

    # 'ovo' gives one column per pair of classes.
    clf = svm.SVC(kernel='linear', C=0.1,
                  decision_function_shape='ovo').fit(X_train, y_train)
    dec = clf.decision_function(X_train)
    assert_equal(dec.shape, (len(X_train), 10))

    # The default setting must emit the behaviour-change warning.
    clf = svm.SVC(kernel='linear', C=0.1).fit(X_train, y_train)
    msg = "change the shape of the decision function"
    dec = assert_warns_message(ChangedBehaviorWarning, msg,
                               clf.decision_function, X_train)
    assert_equal(dec.shape, (len(X_train), 10))
def test_multi_predict(self):
    """Check that repeated GPU predictions agree with each other and with CPU."""
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split

    n = 1000
    X, y = make_regression(n, random_state=rng)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=123)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test)

    # Train one booster per predictor; only 'predictor' changes between them.
    params = {"tree_method": "gpu_hist", 'predictor': "gpu_predictor"}
    bst_gpu_predict = xgb.train(params, dtrain)

    params['predictor'] = "cpu_predictor"
    bst_cpu_predict = xgb.train(params, dtrain)

    # Predict twice with the GPU booster and once with the CPU booster.
    predict0 = bst_gpu_predict.predict(dtest)
    predict1 = bst_gpu_predict.predict(dtest)
    cpu_predict = bst_cpu_predict.predict(dtest)

    assert np.allclose(predict0, predict1)
    assert np.allclose(predict0, cpu_predict)
def _load_rgb_images(directory, filenames, size):
    """Read ``filenames`` from ``directory`` into a (len, size, size, 3) uint8 RGB array."""
    images = np.zeros((len(filenames), size, size, 3), dtype=np.uint8)
    for i, name in enumerate(tqdm(filenames)):
        filename = os.path.join(directory, name)
        img = cv2.imread(filename)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError("could not read image: %s" % filename)
        img = cv2.resize(img, (size, size))
        # OpenCV loads BGR; reverse the channel axis to get RGB.
        images[i] = img[:, :, ::-1]
    return images


def pro_progess(filepath="../data"):
    """Load the train/test images and split the training set.

    Images are read from ``<filepath>/train`` and ``<filepath>/test``, resized
    to 299x299 and stored as RGB. A training file is labelled 1 when its name
    starts with ``'dog'`` and 0 otherwise.

    Args:
        filepath: Directory containing the ``train`` and ``test`` subfolders.

    Returns:
        ``(X_train, X_val, y_train, y_val)`` from an 80/20 shuffled split of
        the training images and labels.

    Raises:
        IOError: If an image file cannot be read.
    """
    height = 299
    train_dir = filepath + '/train'
    test_dir = filepath + '/test'

    train_files = os.listdir(train_dir)
    test_files = os.listdir(test_dir)

    # One label per training image, so it lines up with ``train`` row by row.
    labels = np.array([1 if name[:3] == 'dog' else 0 for name in train_files],
                      dtype=np.uint8)

    # Read from the same subdirectories the file names were listed from.
    train = _load_rgb_images(train_dir, train_files, height)
    test = _load_rgb_images(test_dir, test_files, height)

    print('Training Data Size = %.2f GB' % (sys.getsizeof(train) / 1024 ** 3))
    print('Testing Data Size = %.2f GB' % (sys.getsizeof(test) / 1024 ** 3))

    X_train, X_val, y_train, y_val = train_test_split(
        train, labels, shuffle=True, test_size=0.2, random_state=42)
    return X_train, X_val, y_train, y_val
def test_count_vectorizer_pipeline_grid_selection():
    """Grid-search a CountVectorizer + LinearSVC pipeline on the toy corpus."""
    # Raw documents: junk food is labelled -1, everything else +1.
    n_junk = len(JUNK_FOOD_DOCS)
    n_other = len(NOTJUNK_FOOD_DOCS)
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
    target = [-1] * n_junk + [1] * n_other

    # Hold out part of the data for the final evaluation.
    train_data, test_data, target_train, target_test = train_test_split(
        data, target, test_size=.2, random_state=0)

    steps = [('vect', CountVectorizer()),
             ('svc', LinearSVC())]
    pipeline = Pipeline(steps)

    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'svc__loss': ('hinge', 'squared_hinge')
    }

    # Search over both the feature extraction and the classifier settings.
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # The best model must be 100% correct on the held-out set.
    fitted_search = grid_search.fit(train_data, target_train)
    pred = fitted_search.predict(test_data)
    assert_array_equal(pred, target_test)

    # All candidates reach a perfect score on this toy dataset; the assertion
    # below pins which ngram_range the selected estimator ends up with.
    assert_equal(grid_search.best_score_, 1.0)
    best_vectorizer = grid_search.best_estimator_.named_steps['vect']
    assert_equal(best_vectorizer.ngram_range, (1, 1))
def main(unused_argv):
    """Train the custom estimator on iris and print its test accuracy twice.

    Accuracy is computed once with scikit-learn from the predicted classes and
    once with the estimator's own ``evaluate``.
    """
    iris = datasets.load_iris()
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=42)

    classifier = tf.estimator.Estimator(model_fn=my_model)
    make_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn

    # Train: repeat the data indefinitely and let ``steps`` bound the run.
    train_input_fn = make_input_fn(
        x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
    classifier.train(input_fn=train_input_fn, steps=1000)

    # Predict: a single ordered pass so predictions line up with y_test.
    test_input_fn = make_input_fn(
        x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
    predictions = classifier.predict(input_fn=test_input_fn)
    y_predicted = np.array([p['class'] for p in predictions])
    y_predicted = y_predicted.reshape(np.array(y_test).shape)

    # Score with sklearn.
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy (sklearn): {0:f}'.format(score))

    # Score with tensorflow.
    scores = classifier.evaluate(input_fn=test_input_fn)
    print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
def learn(self, X, y):
    """
    Learn the best model for the data.

    Each candidate model is fitted on a training split and scored on the
    held-out split (accuracy for classification, mean squared error
    otherwise). The best-scoring model is stored in ``self.model_``.

    Parameters
    ----------
    X : nd-array
        Data array (n_samples, n_features)
    y : nd-array
        Targets.

    Returns
    -------
    None
    """
    # split into train/validation; the held-out fraction is self.test_size_
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=self.test_size_, random_state=self.seed_)

    is_classification = self.model_type_ == "classification"

    # list of supervised learning methods to try
    if is_classification:
        models = [MLPClassifier(alpha=1, max_iter=1000),
                  KNeighborsClassifier(),
                  SVC(kernel="linear", C=0.025),
                  SVC(kernel="poly", C=1),
                  SVC(kernel="rbf", gamma=2, C=1),
                  SVC(kernel="sigmoid", C=1),
                  GaussianProcessClassifier(RBF()),
                  GaussianProcessClassifier(ConstantKernel()),
                  GaussianProcessClassifier(RationalQuadratic()),
                  DecisionTreeClassifier(max_depth=5),
                  RandomForestClassifier(max_depth=5, n_estimators=10,
                                         max_features=1),
                  AdaBoostClassifier(),
                  GaussianNB(),
                  LinearDiscriminantAnalysis(),
                  QuadraticDiscriminantAnalysis()]
        score_name = "accuracy score"
        # Start below every attainable accuracy so that a model is always
        # selected, even when all candidates score 0.
        best_score = -inf
    else:
        models = [MLPRegressor(alpha=1, max_iter=1000),
                  KNeighborsRegressor(),
                  SVR(kernel="linear", C=0.025),
                  SVR(kernel="poly", C=1),
                  SVR(kernel="rbf", gamma=2, C=1),
                  SVR(kernel="sigmoid", C=1),
                  GaussianProcessRegressor(RBF()),
                  GaussianProcessRegressor(ConstantKernel()),
                  GaussianProcessRegressor(RationalQuadratic()),
                  DecisionTreeRegressor(max_depth=5),
                  RandomForestRegressor(max_depth=5, n_estimators=10,
                                        max_features=1),
                  AdaBoostRegressor(),
                  Lasso()]
        score_name = "MSE"
        best_score = inf

    # for each model, fit on training data then predict on testing data
    for m in models:
        m.fit(X_train, y_train)
        y_pred = m.predict(X_test)

        if is_classification:
            # higher accuracy is better
            score = accuracy_score(y_test, y_pred)
            improved = score > best_score
        else:
            # lower error is better
            score = mean_squared_error(y_test, y_pred)
            improved = score < best_score

        if improved:
            # set it as the best score and best model
            best_score = score
            self.model_ = m

    # TODO: grid search parameters for best model and save

    # done learning
    print("My big brain has learned everything.\n")
    if self.verbose_:
        print("Best model: %s" % self.model_)
        print("Best %s: %0.3f" % (score_name, best_score))
# Replace missing values in columns 1 to 3 with the column mean.
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0).fit(
    previsores[:, 1:4])
previsores[:, 1:4] = imputer.transform(previsores[:, 1:4])

# =============================================================================
# ===============            VALUE STANDARDISATION              ===============
# =============================================================================
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

# =============================================================================
# =============       TRAINING SET AND TEST SET CREATION       ================
# =============================================================================
from sklearn.model_selection import train_test_split

(previsores_treinamento,
 previsores_teste,
 classe_treinamento,
 classe_teste) = train_test_split(
    previsores, classe, test_size=0.25, random_state=0)

# Library import
from tensorflow import keras
from tensorflow.keras.layers import Dense, Activation

# Classifier
classificador = keras.models.Sequential()

# First hidden layer (takes the 3 input features)
classificador.add(Dense(units=2, activation='relu', input_dim=3))

# Another hidden layer
classificador.add(Dense(units=2, activation='relu'))
def exp_hog_parameters(args, var, to_log=True):
    """Train and score a linear SVC for one set of feature parameters.

    Features are extracted from a random sample of car / not-car images,
    standardised, split 90/10 and used to fit a ``LinearSVC``. The
    parameters, the test accuracy and the timings are written with
    ``log_write`` when ``to_log`` is true.

    Parameters
    ----------
    args : passed through to ``list_all_images`` and ``log_write``.
    var : mapping holding the feature-extraction parameters read below.
    to_log : bool, whether to write the result row with ``log_write``.
    """
    # define feature parameters
    cell_per_block = var['cell_per_block']  # 2
    color_space = var['color_space']  # can be RGB, HSV, LUV, HLS, YUV, YCrCb
    hist_bins = var['hist_bins']  # 32, number of histogram bins
    hist_feat = var['hist_feat']  # histogram features on or off
    hog_channel = var['hog_channel']  # can be 0, 1, 2, or 'ALL'
    hog_feat = var['hog_feat']  # HOG features on or off
    orient = var['orient']  # 8
    pix_per_cell = var['pix_per_cell']  # 8
    spatial_feat = var['spatial_feat']  # spatial features on or off
    spatial_size = var['spatial_size']  # (32, 32), spatial binning dimensions
    # The remaining entries are not used below; the lookups are kept so that
    # an incomplete `var` still raises KeyError as before.
    overlap = var['overlap']  # 0.5
    scale = var['scale']  # 1.0
    x_start_stop = var['x_start_stop']  # [None, None]
    y_start_stop = var['y_start_stop']  # [400, 656]
    xy_window = var['xy_window']  # (128, 128)

    # keyword arguments shared by every feature-extraction call
    feature_params = dict(
        color_space=color_space, spatial_size=spatial_size,
        hist_bins=hist_bins, orient=orient, pix_per_cell=pix_per_cell,
        cell_per_block=cell_per_block, hog_channel=hog_channel,
        spatial_feat=spatial_feat, hist_feat=hist_feat, hog_feat=hog_feat)

    cars, notcars = list_all_images(args)

    # choose random car/notcar indices, or the fixed pair when disabled
    flag_random = False
    if flag_random:
        car_ind = np.random.randint(0, len(cars))
        notcar_ind = np.random.randint(0, len(notcars))
    else:
        car_ind, notcar_ind = 2734, 7868

    # read in car / notcar images
    car_image = mpimg.imread(cars[car_ind])
    notcar_image = mpimg.imread(notcars[notcar_ind])

    # Pick a few images per class. random.randrange excludes the upper
    # bound; random.randint(0, len(seq)) could return len(seq) and index one
    # past the end of the list.
    num_img = 5
    car_plot_idxs = [random.randrange(len(cars)) for _ in range(num_img)]
    notcar_plot_idxs = [random.randrange(len(notcars))
                        for _ in range(num_img)]
    if flag_random:
        cars_image_to_plot = [[cars[i].split('\\')[-1][:-4], cars[i]]
                              for i in car_plot_idxs]
        notcars_image_to_plot = [[notcars[i].split('\\')[-1][:-4], notcars[i]]
                                 for i in notcar_plot_idxs]
    else:
        cars_image_to_plot = [[i, cars[i]] for i in car_plot_idxs]
        notcars_image_to_plot = [[i, notcars[i]] for i in notcar_plot_idxs]

    # single-image features with the HOG visualisation enabled
    car_features, car_hog_image = single_img_features(
        car_image, vis=True, **feature_params)
    notcar_features, notcar_hog_image = single_img_features(
        notcar_image, vis=True, **feature_params)

    # time the feature extraction on a random sample of each class
    t = time.time()
    n_samples = 1000
    # Draw the indices per class: reusing the car indices for `notcars`
    # fails when `notcars` is the shorter list.
    car_idxs = np.random.randint(0, len(cars), n_samples)
    notcar_idxs = np.random.randint(0, len(notcars), n_samples)
    test_cars = np.array(cars)[car_idxs]
    test_noncars = np.array(notcars)[notcar_idxs]

    car_features = extract_features(test_cars, **feature_params)
    notcar_features = extract_features(test_noncars, **feature_params)

    t_feature_computation = round(time.time() - t, 2)
    print(t_feature_computation, 'Seconds to compute features...')

    X = np.vstack((car_features, notcar_features)).astype(np.float64)
    # fit a per-column scaler and apply it to X
    X_scaler = StandardScaler().fit(X)
    scaled_X = X_scaler.transform(X)

    # define the labels vector: 1 for cars, 0 for not-cars
    y = np.hstack((np.ones(len(car_features)),
                   np.zeros(len(notcar_features))))

    # split up data into randomized training and test sets
    rand_state = np.random.randint(0, 100)
    X_train, X_test, y_train, y_test = train_test_split(
        scaled_X, y, test_size=0.1, random_state=rand_state)

    print('Using:', orient, 'orientations,', pix_per_cell, 'pixels per cell,',
          cell_per_block, 'cells per block,', hist_bins,
          'histogram bins, and', spatial_size, 'spatial sampling')
    print('Feature vector length:', len(X_train[0]))

    # use a linear SVC and check its training time
    svc = LinearSVC()
    t = time.time()
    svc.fit(X_train, y_train)
    t_train = round(time.time() - t, 2)
    print(t_train, 'Seconds to train SVC...')

    # check the score of the SVC
    accuracy = round(svc.score(X_test, y_test), 4)
    print('Test Accuracy of SVC = ', accuracy)

    log = [cell_per_block, color_space, hist_bins, hist_feat, hog_channel,
           orient, pix_per_cell, spatial_feat, spatial_size,
           accuracy, len(X_train[0]),
           t_feature_computation, t_train, t_feature_computation + t_train]
    if to_log:
        log_write(args, log)
# In[ ]:

# Review X (summary statistics, then the first rows) before proceeding.
print(X.describe())
print("\n")
print(X.head())


# # Building the model using Decision Tree Regressor

# In[ ]:

from sklearn.model_selection import train_test_split

# Split the data into a training part and a validation part.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)


# In[ ]:

from sklearn.tree import DecisionTreeRegressor

# random_state=1 is set for reproducibility.
titanic_model = DecisionTreeRegressor(random_state=1)
# Fit the model on the training part.
titanic_model.fit(train_X, train_y)


# In[ ]:

# Predict on the validation part to see how well the model is fitted.
titanic_preds = titanic_model.predict(val_X)
def main():
    """Fit a k-nearest-neighbours classifier on the iris CSV and report it.

    Reads ``../iris.csv``, holds out 20% of the rows, then prints the
    accuracy, the confusion matrix and the classification report on the
    held-out rows.
    """
    PATH = "../iris.csv"
    columns = [
        "sepal-length",
        "sepal-width",
        "petal-length",
        "petal-width",
        "class",
    ]
    df = read_csv(PATH, names=columns)

    # Quick-look summaries; computed but not displayed.
    df_dimension = df.shape
    df_head = df.head(5)
    df_summary = df.describe()
    df_class_distribution = df.groupby("class").size()

    # The univariate / multivariate plots and the cross-validated spot check
    # of several algorithms (LR, LDA, KNN, CART, NB, SVM) are disabled here.

    # The first four columns are the features, the fifth is the class label.
    array = df.values
    X = array[:, 0:4]
    Y = array[:, 4]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.20, random_state=7)

    m_knn = KNeighborsClassifier()
    m_knn.fit(X_train, Y_train)
    predictions = m_knn.predict(X_test)

    accuracy = accuracy_score(Y_test, predictions)
    print(f"Accuracy {accuracy * 100:.2f}%")
    matrix = confusion_matrix(Y_test, predictions)
    print(f"Confusion Matrix {matrix}")
    report = classification_report(Y_test, predictions)
    print(f"Classification report {report}")
# ### Split Data

# Fraction of all rows kept for train+test (the rest is the hold-out set),
# and fraction of those rows used for training.
train_test_size = 0.8
train_size = 0.625
data_aug = False
batch_size = 16
rand_state = None  # 1337

df = data[keep]
# df = df[df.index > '2016']  # only keep data after 2015
# labels = labels.loc[df.index]

# First split: row positions into (train+test) and hold-out, stratified on
# the labels.
train_test_idx, hold_idx, y_train_test, y_hold = train_test_split(
    np.arange(len(df)),
    labels.values.ravel(),
    train_size=train_test_size,
    shuffle=True,
    stratify=labels.values.ravel(),
    random_state=rand_state,
)
X_train_test = df.iloc[train_test_idx].values
X_hold = df.iloc[hold_idx].values

# Second split: the (train+test) positions into train and test, again
# stratified.
train_idx, test_idx, y_train, y_test = train_test_split(
    train_test_idx,
    y_train_test,
    train_size=train_size,
    shuffle=True,
    stratify=y_train_test,
    random_state=rand_state,
)
from copy import copy

import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-18.04580688663451
# The union stacks the ElasticNetCV output next to a copy of the features;
# LinearSVR is the final regressor.
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.55, tol=0.001)),
        FunctionTransformer(copy),
    ),
    LinearSVR(
        C=5.0,
        dual=True,
        epsilon=1.0,
        loss="epsilon_insensitive",
        tol=0.1,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import (
    Convolution1D,
    MaxPooling1D,
    Activation,
    Flatten,
    Dropout,
)

# Characters that can appear as a label.
one_hot_list = [
    '2', '3', '4', '6', '7', '8', '9',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M',
    'N', 'P', 'Q', 'R', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
]
FILE_PATH = "model2.h5"  # where the model is saved to and loaded from
IMAGE_SIZE = 30

# The first 900 columns are pixel values, column 900 is the label.
dataset = pd.read_hdf('./data/train.h5', key='train')
dataset_x = dataset.iloc[:, 0:900].values
# Reshape each row to IMAGE_SIZE x IMAGE_SIZE and divide by 255.
dataset_x = dataset_x.reshape(
    dataset_x.shape[0], IMAGE_SIZE, IMAGE_SIZE) / 255.0
dataset_y = pd.get_dummies(dataset.iloc[:, 900]).values

x_train, x_test, y_train, y_test = train_test_split(
    dataset_x, dataset_y, test_size=0.2, random_state=0)

# Two convolution / pooling stages followed by a dense classifier head with
# one output per label column.
model = Sequential([
    Convolution1D(filters=32, kernel_size=3, padding='same',
                  input_shape=x_train.shape[1:]),
    Activation('relu'),
    MaxPooling1D(pool_size=2, strides=2, padding='same'),
    Convolution1D(filters=64, kernel_size=3, padding='same'),
    Activation('relu'),
    MaxPooling1D(pool_size=2, strides=2, padding='same'),
    Flatten(),
    Dense(512),
    Activation('relu'),
    Dropout(0.2),
    Dense(dataset_y.shape[1]),
    Activation('softmax'),
])
model.summary()
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
import pandas as pd from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression import seaborn as sns import numpy as np import matplotlib.pyplot as plt nyc2 = pd.read_csv("ave_yearly_temp_nyc_1895-2017.csv") # One column is a series X_train, X_test, y_train, y_test = train_test_split( nyc2.Date.values.reshape(-1, 1), nyc2.Value.values, random_state=11 ) linear_regression = LinearRegression() linear_regression.fit(X=X_train, y=y_train) print(linear_regression.coef_) print(linear_regression.intercept_) predicted = linear_regression.predict(X_test) expected = y_test for p, e in zip( predicted[::5], expected[::5] ): # double colons say print every 5th element print(f"Predicted: {p:.2f}, expected: {e:.2f}") # lambda implements y = mx + b predict = lambda x: linear_regression.coef_ * x + linear_regression.intercept_ axes = sns.scatterplot(
dataset = pd.read_csv(
    r'C:\Sid Data\BITS\4th Sem\Udemey\Part 3 - Classification\Section 14 - Logistic Regression\Social_Network_Ads.csv'
)

# Columns 2 and 3 are used as the features X
# (presumably age and salary; confirm against the CSV header).
x_data = dataset.iloc[:, [2, 3]].values
print(x_data)

# The last column is the target.
row, columns = dataset.shape
column_index = columns - 1
y_data = dataset.iloc[:, column_index].values
print(y_data)

# Split into training and test sets
from sklearn.model_selection import train_test_split

x_data_train, x_data_test, y_data_train, y_data_test = train_test_split(
    x_data, y_data, test_size=0.25, random_state=0)

# Scale the data before running the algorithms
from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler()
x_data_train = sc_x.fit_transform(pd.DataFrame(x_data_train))
# Reuse the statistics learned on the training set. Calling fit_transform
# here would refit the scaler on the test data, so the two sets would be
# scaled differently.
x_data_test = sc_x.transform(pd.DataFrame(x_data_test))
print(x_data)
print(x_data_train)

# Logistic Regression-------------------------------------
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(random_state=0)
log_reg.fit(x_data_train, y_data_train)

# Predict test results
y_data_pred = log_reg.predict(x_data_test)
preprocess_text(technology, sentences, 'technology')
preprocess_text(car, sentences, 'car')
preprocess_text(entertainment, sentences, 'entertainment')
preprocess_text(military, sentences, 'military')
preprocess_text(sports, sentences, 'sports')

# ---- generate data set ----

# Shuffle the order to produce a more reliable training set
random.shuffle(sentences)
for sentence in sentences:
    print(sentence[0], sentence[1])

# Separate each (content, tag) pair into two parallel sequences, then divide
# them into a training set and a test set with sklearn's split function.
content, tag = zip(*sentences)
content_train, content_test, tag_train, tag_test = train_test_split(
    content, tag, random_state=1234)
print(len(content_train))

# Extract bag-of-words (n-gram count) features from the cleaned text
vectorizer = CountVectorizer(
    analyzer='word',      # build n-grams from words
    ngram_range=(1, 4),   # use n-grams of 1 to 4 words
    max_features=20000)   # keep at most the 20000 most frequent n-grams
vectorizer.fit(content_train)


def get_features(content):
    """Return the n-gram count matrix of *content* from the fitted vectorizer."""
    # The result used to be computed and discarded, so callers got None.
    return vectorizer.transform(content)


# import classifier and train data
plt.legend(loc="best") return plt # digits = load_digits() # X, y = digits.data, digits.target array = data.values X = array[:,0:4] y = array[:,4] validation_size = .2 seed = 13 scoring = 'accuracy' X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X,y,test_size=validation_size, random_state=seed) cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0) title = "Learning Curves (Naive Bayes)" estimator = GaussianNB() plot_learning_curve(estimator, title, X_train, Y_train, ylim=(0.7, 1.01), cv=cv, n_jobs=4) title = "Learning curve (K Nearest Neighbours)" estimator = KNeighborsClassifier() plot_learning_curve(estimator, title, X_train, Y_train, ylim=(0.7, 1.01), cv=cv, n_jobs=4) plt.show()
# Missing-value checks (the results are not stored).
data.isnull().values.any()
data[data.isnull().any(axis=1)]

# Add a running counter column and keep every 10th row as the sample.
data['a'] = pd.DataFrame({'a': range(30001)})
sampled_df = data[(data['a'] % 10) == 0]
sampled_df.shape
sampled_df_remaining = data[(data['a'] % 10) != 0]
sampled_df_remaining.shape

# Target column and feature columns of the sample.
LoanY = sampled_df['default payment next month'].copy()
loan_features = [
    'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
    'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
    'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]
LoanX = sampled_df[loan_features].copy()
classifed_names = ['default payment next month']

# 70/30 split; trainTest holds the list returned by train_test_split.
trainTest = train_test_split(
    LoanX, LoanY, test_size=0.3, train_size=0.7, random_state=0)

from sklearn.model_selection import GridSearchCV, learning_curve
from datetime import datetime
from sklearn.metrics import confusion_matrix

cross_validations = 10
train_sizes_base = [100, 200, 400, 600, 800, 1000, 1200, 1400]


def plot_learning_curve(title, cv_curve):
    # _, _, test_scores_base = base_curve
    train_sizes, train_scores, test_scores = cv_curve
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
num_features = X.shape[1]

# Standardise the features.
standardise = StandardScaler()
X = standardise.fit_transform(X)

# Shift the labels down by one before one-hot encoding them
# (presumably so that they start at 0; confirm against the raw labels).
y -= 1
y = tf.keras.utils.to_categorical(y).astype(np.float64)

# Tuning H
accuracies = defaultdict(list)
for H in range(0, 10, 2):  # check H in {0, 2, 4, 6, 8}
    # Ten independent random 90/10 splits per value of H.
    for i in range(10):
        print(f'H: {H}, Fold {i+1}')
        train_x, test_x, train_y, test_y = train_test_split(
            X, y, test_size=0.1)

        model = BfgsMlp(
            n_input=num_features, n_hidden=H, n_output=num_classes)
        model.fit(train_x, train_y, max_iterations=100)

        test_acc = model.accuracy(test_x, test_y)
        accuracies[H].append(test_acc)

# Replace each list of accuracies by its mean.
accuracies = {H: sum(accs) / len(accs) for H, accs in accuracies.items()}
for H, acc in accuracies.items():
    print(f'Average accuracy for H = {H}: {acc*100:.2f}%')

# Once the best H value is found, make a model with this H and monitor
# training to check for over-fitting.
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.1)
best_H = max(accuracies, key=accuracies.get)
# Build one feature row per data-frame row from columns 2 .. temp_col - 2;
# the last column is the target.
for i in range(0, temp_row):
    for j in range(2, temp_col - 1):
        x_temp.append(df.iloc[i, j])
    x.append(x_temp)
    y.append(df.iloc[i, -1])
    x_temp = []

print('x=')
print(x)
print('y=')
print(y)

# Rescale the features with a min-max scaler.
MinMax = MinMaxScaler()
x_new = MinMax.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(
    x_new, y, test_size=0.25)

print('y_train =')
print(y_train)
print('y_test =')
print(y_test)

cv = StratifiedKFold(n_splits=5, shuffle=True)

# Candidate values of C: 0.5, 1.0, ..., 4.5.
C = np.arange(0.5, 5, 0.5)
param_grid = {'C': C}

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
model = SVR()
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=kfold,
)

# NOTE(review): this replaces the training targets with zeros; confirm that
# it is intended.
y_train = np.zeros(len(y_train))
if sigmoid(X[i], theta) > 0.5: y[i] = 1 return y return theta def accuracy(X_test, y_test): y_pred = predict(X_test, theta) return accuracy_score(y_test, y_pred) # X_train, X_test, y_train, y_test np.ones(n) m, n = np.shape(X) X_ex = np.c_[X, np.ones(m)] # 扩展矩阵为 [x, 1] #print (X_ex) X_train, X_test, y_train, y_test = model_selection.train_test_split( X_ex, y, test_size=0.5, random_state=0) # 通过梯度下降法得到最优参数 theta = gradDscent_1(X_train, y_train) # 做出预测 映射theta y_pred = predict(X_test, theta) m_test = np.shape(X_test)[0] # 混淆矩阵的计算和预测精度 cfmat = np.zeros((2, 2)) for i in range(m_test): if y_pred[i] == y_test[i] == 0: cfmat[0, 0] += 1 elif y_pred[i] == y_test[i] == 1: cfmat[1, 1] += 1 elif y_pred[i] == 0: cfmat[1, 0] += 1 elif y_pred[i] == 1:
# Column 3 of the dataset is the dependent variable.
y = dataset.iloc[:, 3].values

# Optional step (disabled): taking care of missing data
#from sklearn.preprocessing import Imputer
#imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
#imputer = imputer.fit(X[:, 1:3])
#X[:, 1:3] = imputer.transform(X[:, 1:3])

# Optional step (disabled): encoding categorical data
# Encoding the Independent Variable
#from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#labelencoder_X = LabelEncoder()
#X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
#onehotencoder = OneHotEncoder(categorical_features = [0])
#X = onehotencoder.fit_transform(X).toarray()
## Encoding the Dependent Variable
#labelencoder_y = LabelEncoder()
#y = labelencoder_y.fit_transform(y)

# Split the dataset into the training set and the test set (80/20).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Optional step (disabled): feature scaling
#from sklearn.preprocessing import StandardScaler
#sc_X = StandardScaler()
#X_train = sc_X.fit_transform(X_train)
#X_test = sc_X.transform(X_test)
import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.metrics import mean_absolute_error from sklearn.metrics import mean_squared_error from sklearn.metrics import r2_score import time import csv from sklearn.neural_network import MLPRegressor start = time.perf_counter() train = pd.read_csv('train.csv') X = train.drop(columns=['winPlacePerc']) Y = train['winPlacePerc'] X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2) clf = MLPRegressor(hidden_layer_sizes=( 300, 200, 100, 50, ), activation='relu', solver='adam', alpha=0.0001, batch_size=100, learning_rate='constant', learning_rate_init=0.001, max_iter=200, shuffle=True, verbose=True, early_stopping=True,
def main(): print 'Using Keras version: ', keras.__version__ usage = 'usage: %prog [options]' parser = argparse.ArgumentParser(usage) parser.add_argument('-t', '--train_model', dest='train_model', help='Option to train model or simply make diagnostic plots (0=False, 1=True)', default=0, type=int) parser.add_argument('-w', '--classweights', dest='classweights', help='Option to choose class weights', default='InverseSRYields', type=str) parser.add_argument('-s', '--sel', dest='selection', help='Option to choose selection', default='tH', type=str) args = parser.parse_args() do_model_fit = args.train_model classweights_name = args.classweights selection = args.selection # Number of classes to use number_of_classes = 4 # Create instance of output directory where all results are saved. output_directory = '2017samples_%s_%s/' % (selection,classweights_name) check_dir(output_directory) # Create plots subdirectory plots_dir = os.path.join(output_directory,'plots/') input_var_jsonFile = open('input_vars_SigRegion_wFwdJet.json','r') if selection == 'tH': selection_criteria = '(is_tH_like_and_not_ttH_like==0 || is_tH_like_and_not_ttH_like==1)'#&& n_presel_jet>=3' # Load Variables from .json variable_list = json.load(input_var_jsonFile,encoding="utf-8").items() # Create list of headers for dataset .csv column_headers = [] for key,var in variable_list: column_headers.append(key) column_headers.append('EventWeight') column_headers.append('xsec_rwgt') column_headers.append('nEvent') # Create instance of the input files directory inputs_file_path = '/afs/cern.ch/work/j/jthomasw/private/IHEP/ttHML/github/ttH_multilepton/keras-DNN/samples/rootplas_LegacyMVA_1113/DiLepRegion/ttH2017TrainDNN2L/' # Load ttree into .csv including all variables listed in column_headers print '<train-DNN> Input file path: ', inputs_file_path outputdataframe_name = '%s/output_dataframe_%s.csv' %(output_directory,selection) if os.path.isfile(outputdataframe_name): data = pandas.read_csv(outputdataframe_name) 
print '<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name) else: print '<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path) data = load_data(inputs_file_path,column_headers,selection_criteria) data.to_csv(outputdataframe_name, index=False) data = pandas.read_csv(outputdataframe_name) # Make instance of plotter tool Plotter = plotter() # Create statistically independant lists train/test data (used to train/evaluate the network) traindataset, valdataset = train_test_split(data, test_size=0.2) #valdataset.to_csv('valid_dataset.csv', index=False) #print '<train-DNN> Training dataset shape: ', traindataset.shape #print '<train-DNN> Validation dataset shape: ', valdataset.shape # Remove last two columns (Event weight and xsrw) from column headers training_columns = column_headers[:-3] print '<train-DNN> Training features: ', training_columns # Select data from columns under the remaining column headers in traindataset X_train = traindataset[training_columns].values # Select data from 'target' as target for MVA Y_train = traindataset.target.astype(int) X_test = valdataset[training_columns].values Y_test = valdataset.target.astype(int) num_variables = len(training_columns) # Create dataframe containing input features only (for correlation matrix) train_df = data.iloc[:traindataset.shape[0]] train_df.drop(['EventWeight'], axis=1, inplace=True) train_df.drop(['xsec_rwgt'], axis=1, inplace=True) ## Input Variable Correlation plot correlation_plot_file_name = 'correlation_plot.png' #Plotter.correlation_matrix(train_df) #Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name) #sampleweights = traindataset.loc[:,'sampleWeight']*traindataset.loc[:,'EventWeight'] sampleweights = traindataset.loc[:,'sampleWeight'] sampleweights = np.array(sampleweights) # Dictionaries of class weights to combat class imbalance if classweights_name == 'balanced': tuned_weighted = class_weight.compute_class_weight('balanced', 
np.unique([0,1,2,3]), Y_train) if classweights_name == 'tunedweights': tuned_weighted = {0 : 7.67, 1 : 1.0, 2 : 4.62, 3 : 7.67} # Per instance weights calculation so we can correctly apply event weights to diagnostic plots train_weights = traindataset['EventWeight'].values * traindataset['xsec_rwgt'].values test_weights = valdataset['EventWeight'].values * valdataset['xsec_rwgt'].values # Fit label encoder to Y_train newencoder = LabelEncoder() newencoder.fit(Y_train) # Transform to encoded array encoded_Y = newencoder.transform(Y_train) encoded_Y_test = newencoder.transform(Y_test) # Transform to one hot encoded arrays Y_train = np_utils.to_categorical(encoded_Y) Y_test = np_utils.to_categorical(encoded_Y_test) optimizer = 'Adam'#'Nadam' if do_model_fit == 1: histories = [] labels = [] # Define model and early stopping early_stopping_monitor = EarlyStopping(patience=100,monitor='val_loss',verbose=1) model3 = baseline_model(num_variables,optimizer,number_of_classes) # Fit the model # Batch size = examples before updating weights (larger = faster training) # Epochs = One pass over data (useful for periodic logging and evaluation) #history3 = model3.fit(X_train,Y_train,validation_split=0.2,epochs=500,batch_size=1000,verbose=1,shuffle=True,class_weight=tuned_weighted,callbacks=[early_stopping_monitor]) history3 = model3.fit(X_train,Y_train,validation_split=0.2,epochs=300,batch_size=1500,verbose=1,shuffle=True,sample_weight=sampleweights,callbacks=[early_stopping_monitor]) histories.append(history3) labels.append(optimizer) # Make plot of loss function evolution Plotter.plot_training_progress_acc(histories, labels) acc_progress_filename = 'DNN_acc_wrt_epoch.png' Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename) # Which model do you want the rest of the plots for? model = model3 else: # Which model do you want to load? 
model_name = os.path.join(output_directory,'model.h5') print '<train-DNN> Loaded Model: %s' % (model_name) model = load_trained_model(model_name,num_variables,optimizer,number_of_classes) # Node probabilities for training sample events result_probs = model.predict(np.array(X_train)) result_classes = model.predict_classes(np.array(X_train)) # Node probabilities for testing sample events result_probs_test = model.predict(np.array(X_test)) result_classes_test = model.predict_classes(np.array(X_test)) # Store model in file model_output_name = os.path.join(output_directory,'model.h5') model.save(model_output_name) weights_output_name = os.path.join(output_directory,'model_weights.h5') model.save_weights(weights_output_name) model_json = model.to_json() model_json_name = os.path.join(output_directory,'model_serialised.json') with open(model_json_name,'w') as json_file: json_file.write(model_json) model.summary() model_schematic_name = os.path.join(output_directory,'model_schematic.png') plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True) # Initialise output directory. 
Plotter.plots_directory = plots_dir Plotter.output_directory = output_directory # Make overfitting plots of output nodes Plotter.overfitting(model, Y_train, Y_test, result_probs, result_probs_test, plots_dir, train_weights, test_weights) # Get true process values for testing dataset original_encoded_test_Y = [] for i in xrange(len(result_probs_test)): if Y_test[i][0] == 1: original_encoded_test_Y.append(0) if Y_test[i][1] == 1: original_encoded_test_Y.append(1) if Y_test[i][2] == 1: original_encoded_test_Y.append(2) if Y_test[i][3] == 1: original_encoded_test_Y.append(3) # Get true process integers for training dataset original_encoded_train_Y = [] for i in xrange(len(result_probs)): if Y_train[i][0] == 1: original_encoded_train_Y.append(0) if Y_train[i][1] == 1: original_encoded_train_Y.append(1) if Y_train[i][2] == 1: original_encoded_train_Y.append(2) if Y_train[i][3] == 1: original_encoded_train_Y.append(3) # Get true class values for testing dataset result_classes_test = newencoder.inverse_transform(result_classes_test) result_classes_train = newencoder.inverse_transform(result_classes) # Create confusion matrices for training and testing performance Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'index') Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TRAIN.png') Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'index') Plotter.save_plots(dir=plots_dir, filename='yields_norm_confusion_matrix_TEST.png') Plotter.conf_matrix(original_encoded_train_Y,result_classes_train,train_weights,'') Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TRAIN.png') Plotter.conf_matrix(original_encoded_test_Y,result_classes_test,test_weights,'') Plotter.save_plots(dir=plots_dir, filename='yields_matrix_TEST.png') Plotter.ROC_sklearn(original_encoded_train_Y, result_probs, original_encoded_test_Y, result_probs_test, 0 , 'ttHnode') Plotter.ROC_sklearn(original_encoded_train_Y, 
result_probs, original_encoded_test_Y, result_probs_test, 1 , 'Other') Plotter.ROC_sklearn(original_encoded_train_Y, result_probs, original_encoded_test_Y, result_probs_test, 2 , 'ttWnode') Plotter.ROC_sklearn(original_encoded_train_Y, result_probs, original_encoded_test_Y, result_probs_test, 3 , 'tHQnode')
df.loc[-1] = ["covid", ('covid/' + filename)] df.index = df.index + 1 df = df.sort_index() for dirpath, dirnames, filenames in os.walk('../input/covid19-pneumonia-normal-chest-xray-pa-dataset/pneumonia'): for filename in filenames: df.loc[-1] = ["pneumonia", ('pneumonia/' + filename)] df.index = df.index + 1 df = df.sort_index() df_y = df['class'] df_x = df['directory'] """## Data preprocessing""" X_train, X_test, Y_train, Y_test = train_test_split(df_x, df_y, stratify=df_y, test_size=0.20, random_state=7) df_x, df_y = X_train, Y_train test = pd.concat([X_test, Y_test], axis = 1) test.head() # used to copy files according to each fold def copy_images(df, directory): # input and output directory input_path = "../input/covid19-pneumonia-normal-chest-xray-pa-dataset" output_path = "out/" + directory # remove all files from previous fold if os.path.exists(output_path):
#     master_df = pd.concat([master_df, df], axis=1)

# This code is for the normalized data: append the columns of every file
# in teste_list to master_df.
for key in teste_list:
    df = pd.read_table(key, header=None)
    master_df = pd.concat([master_df, df], axis=1)

# Targets: profile.csv with columns 1-4 dropped.
target_df = pd.read_table('profile.csv', header=None)
target_df = target_df.drop([1, 2, 3, 4], axis=1)

print(master_df)
# print(master_df.dtypes)
# print(target_df.dtypes)

x_train, x_test, y_train, y_test = train_test_split(
    master_df, target_df, test_size=0.3, random_state=42)

# 4-nearest-neighbours classifier; ravel() flattens the target values to
# one dimension for fit.
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(x_train, y_train.values.ravel())
print(knn.score(x_test, y_test))

pred = knn.predict(x_test)
y_train_pred = knn.predict(x_train)

# Confusion matrices on the training and on the test data.
cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, pred)
print('confusion matrix teste\n', cm_test, '\n')
print('confusion matrix treino\n', cm_train, '\n')
print(classification_report(y_test, pred), '\n')

# Elapsed time since `start`, which is set earlier in the script.
stop = timeit.default_timer()
print('Programa executado em ', stop - start, 'segundos\n')
import pandas as pd
from sklearn import svm
# train_test_split is used to divide the data into train and test parts
from sklearn.model_selection import GridSearchCV, train_test_split

# Load the training CSV and take a quick look at it.
train = pd.read_csv('../data/fashion-mnist_train.csv')
train.head()
train.describe().T

# Every column except 'label' is a feature; 'label' is the target.
y = train['label']
X = train.drop(columns='label')

# Split into training data and test data (library default proportions).
X_train, X_test, y_train, y_test = train_test_split(X, y)

################################################################################
# Set the parameters by cross-validation
param_grid = [
    {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000],
    },
]

clf = GridSearchCV(
    svm.SVC(C=1),
    param_grid,
    n_jobs=4,
    refit=True,
)
clf.fit(X_train, y_train)
print("------fin-----")
# Collect the image path and colour label of every row in the targets CSV.
with open('labeler/targets_large.csv') as handle:
    reader = csv.DictReader(handle)
    for row in reader:
        img_name = row['img'].strip()
        train_imgs.append('./labeler/car_ims/{}'.format(img_name))
        train_labels.append(row['color'])
print("loaded imgs and labels")

# Turn the processed images and labels into arrays.
X, y = read_and_process_image(train_imgs, train_labels)
X, y = np.array(X), np.array(y)

# 70/30 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=2
)

# Labels are passed through to_categorical separately for each partition.
y_train, y_val = to_categorical(y_train), to_categorical(y_val)

for caption, arr in (
    ("Shape of train images is:", X_train),
    ("Shape of validation images is:", X_val),
    ("Shape of labels is:", y_train),
    ("Shape of labels is:", y_val),
):
    print(caption, arr.shape)

# Sizes and constants used further down in the script.
ntrain, nval = len(X_train), len(X_val)
batch_size = 8
columns = 5
# Loop through each training image for the current person.
# NOTE(review): `person`, `pix`, `encodings` and `names` are bound before the
# visible code - presumably by an enclosing per-person loop; confirm.
for person_img in pix:
    image_path = "./train/" + person + "/" + person_img
    try:
        face = face_recognition.load_image_file(image_path)
        # Only the first encoding is used; indexing [0] fails when the
        # library returns no encodings for the image.
        face_enc = face_recognition.face_encodings(face)[0]
    except Exception as exc:
        # Best effort: an unusable image is skipped, but the reason is now
        # reported, and KeyboardInterrupt/SystemExit are no longer swallowed
        # as they were by the former bare `except: pass`.
        print("skipping", image_path, "-", repr(exc))
        continue
    # Add face encoding for current image with corresponding label (name)
    # to the training data.
    encodings.append(face_enc)
    names.append(person)

# Hold out 20% of the encodings for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(encodings, names, test_size=0.2, random_state=42)
print("train_test_split completed \n ")

# Create and train the SVC classifier
print("load dataset completed \n training model started")
clf = svm.SVC(gamma='scale')
clf.fit(X_train, y_train)
print("model training completed \n saving the model")

# Predictions on the held-out encodings.
y_pred = clf.predict(X_test)
#precision = average_precision_score(y_test, y_pred)
#print("average precision score is:")
#print(precision)
from sklearn.linear_model import Lasso, LinearRegression, Ridge  # RidgeClassifier is also there
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Columns from index 2 onwards are the features; column 1 holds the labels.
features = dataset.iloc[:, 2:].values

## label encoding
labelencoder = LabelEncoder()
labels = labelencoder.fit_transform(dataset.iloc[:, 1])
print(labels)
# Keep the encoded labels as float64.
labels = labels.astype(np.float64)

# Splitting the dataset into the Training set and Test set (75/25, fixed seed)
split = train_test_split(features, labels, test_size=0.25, random_state=0)
features_train, features_test, labels_train, labels_test = split

# Feature Scaling: fit on the training features only, then apply to both parts.
sc = StandardScaler().fit(features_train)
features_train = sc.transform(features_train)
features_test = sc.transform(features_test)

""" Unregularized model - Linear regressor Regularized model - Ridge regression and lasso """
normalized.head() #pdb.set_trace() normalized['Age'] = pd.DataFrame(df, columns=['Age']) #print(normalized) normalized['Gender'] = pd.DataFrame(df, columns=['Gender']) #print(normalized) normalized['Dataset'] = pd.DataFrame(df, columns=['Dataset']) print(normalized) #import pdb; #pdb.set_trace() #train_test_split comes from scikit library. It is possible to specify # test_Size or train_size. In this case we are defining the test size as 20% #of the dataset train, test = train_test_split(normalized, test_size=0.2) #validation dataset will be 20% of the train dataset train, val = train_test_split(train, test_size=0.1) print(len(train), 'train examples') print(len(val), 'validation examples') print(len(test), 'test examples') # A utility method to create a tf.data dataset from a Pandas Dataframe def df_to_dataset(dataframe, shuffle=True, batch_size=32): dataframe = dataframe.copy() labels = dataframe.pop('Dataset') ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels)) if shuffle: ds = ds.shuffle(buffer_size=len(dataframe)) ds = ds.batch(batch_size)
def _load(self, small: bool = True) -> None:
    """Load the Higgs data and store train/test splits on the instance.

    Reads ``higgs-boson.csv`` relative to ``self.datadir``, keeps the columns
    listed in ``cols_to_keep``, maps the label 's' to 1 and 'b' to 0, and
    samples a fraction of the rows before a 75/25 train/test split.

    Args:
        small: when true, only 15% of the rows are sampled; otherwise all rows.

    Results are written to ``self.train_data`` and ``self.test_data`` as dicts
    with 'features' and 'labels' keys.

    NOTE(review): the nesting of the ``if self.ass2`` / ``if self.to_txt`` /
    ``else`` statements below is an assumption - confirm against the original
    layout.
    """
    df = pd.read_csv(os.path.join(self.datadir,'../../data/higgs/higgs-boson.csv'))
    cols_to_keep = ['DER_mass_MMC', 'DER_mass_transverse_met_lep','DER_mass_vis', 'DER_pt_h', 'DER_deltar_tau_lep','DER_pt_tot', 'DER_sum_pt', 'DER_pt_ratio_lep_tau','DER_met_phi_centrality', 'PRI_tau_pt', 'PRI_tau_eta', 'PRI_tau_phi', 'PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_met', 'PRI_met_phi', 'PRI_met_sumet','Label']
    # Fraction of rows to sample.
    if small:
        fr = .15
    else:
        fr = 1
    # Encode the label column: 's' -> 1, 'b' -> 0.
    df['Label'] = df['Label'].replace('s',1)
    df['Label'] = df['Label'].replace('b',0)
    # Seeded sampling and splitting.
    df = df[cols_to_keep].sample(frac=fr, random_state = 100)
    train, test = train_test_split(df, test_size=.25, random_state=100)
    # The last column ('Label') is the target; everything before it is a feature.
    X_train = np.array(train.iloc[:,:-1])
    y_train = np.array(train.iloc[:,-1])
    X_test = np.array(test.iloc[:,:-1])
    y_test = np.array(test.iloc[:,-1])
    # The scaler is fitted on the training features only and applied to both parts.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    # One float format per feature column plus an integer format for the label.
    fmt = ['%.18f']* (X_train.shape[1]) + ['%d']
    if(self.ass2):
        # Use the text export instead: the last 1250 rows become the test set.
        df = pd.read_csv(os.path.join(self.datadir,'../../../ml_randomized_optimization/ABAGAIL/src/exp/tests/higgs_train.txt'), header=None, sep=',')
        train = np.array(df.iloc[:-1250,:])
        test = np.array(df.iloc[-1250:,:])
        train = np.array(train)
        test = np.array(test)
        if(self.to_txt):
            # Only the first 5000 training rows are written out.
            self._send_to_txt(train[:5000], './higgs_train.txt', fmt=fmt)
            self._send_to_txt(test, './higgs_test.txt', fmt=fmt)
        self.train_data = { 'features': train[:,:-1], 'labels': train[:,-1] }
        self.test_data = { 'features': test[:,:-1], 'labels': test[:,-1] }
    else:
        # Default path: the scaled arrays from the CSV split.
        self.train_data = { 'features': X_train, 'labels': y_train }
        self.test_data = { 'features': X_test, 'labels': y_test }
"upper class": 3 } }) df_cust = df_cust.replace(transformation) df_features = df_cust.loc[:, df_cust.columns.difference(["outcome"])] df_target = df_cust["outcome"] rf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=20) rf.fit(df_features, df_target) top_k_feats = rf.feature_importances_.argsort()[-10:] X, y = df_cust.loc[:, df_features.columns[top_k_feats]], df_target X_train, X_test, y_train, y_test = \ train_test_split(X, y, train_size=.8, shuffle=True, stratify=df_target) scaler = MinMaxScaler().fit(X_train) X_train_scaled = scaler.transform(X_train) X_test_scaled = scaler.transform(X_test) X_tensor = torch.tensor(X_train_scaled, dtype=torch.float32) y_tensor = torch.tensor(y_train.values, dtype=torch.float32) k = X_tensor.shape[1] - 1 frequentist_model = TorchLogisticRegression(k) q = AutoDiagonalNormal(bayes_logistic) svi = SVI(bayes_logistic, q, Adam({"lr": 1e-2}), loss=Trace_ELBO(),
#init paths data_path = './data/' image_path = 'IMG/' #get the steering data, throtlle and brake from csv file with open(data_path + 'driving_log.csv', 'r', newline='') as f: reader = csv.reader(f, delimiter=',') for line in reader: samples.append(line) #shuffle samples samples = shuffle(samples) #split the train and the validation sets - 80/20 percentage train_samples, validation_samples = train_test_split(samples, test_size=0.2) #set up variables for generators to be used later for the fit_generator function train_generator = generator(train_samples, batch_size=32) validation_generator = generator(validation_samples, batch_size=32) ### retreive model #### my_model = Path('./model.h5') if my_model.is_file(): model = load_model('model.h5') else:
# Notebook-style inspection of the features and the target; these bare
# expressions only display something in an interactive session.
X
X.shape
# help(X.std)
# X.std(axis=0)
# So, that's why we skip scaling.
y
y.shape

### Data types and missing values identification
X.dtype
# X.isnull.sum()
# {0:.2f}.format()

from sklearn.model_selection import train_test_split
import pandas as pd

# Seeded split using the library's default proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=14)

print("There are {} samples in the training dataset".format(X_train.shape[0]))
print("There are {} samples in the testing dataset".format(X_test.shape[0]))
print("Each sample has {} features".format(X_train.shape[1]))

# Relative class frequencies. The first line reports the full target `y`, so
# it is labelled as the whole dataset (it used to say "training set").
print("The class distribution of the whole dataset is\n{}.".format(y.value_counts() / len(y)))
print("The class distribution of training set is\n{}.".format(y_train.value_counts() / len(y_train)))
print("The class distribution of test set is\n{}.".format(y_test.value_counts() / len(y_test)))

### Standardization
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on, and applied to, the training features.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
def plot_cross_val_selection():
    """Plot cross-validation accuracy for an SVC grid search on the iris data.

    Runs a 5-fold grid search over ``C`` and ``gamma`` on the train+validation
    part of a seeded split, then plots the parameter settings from position 15
    onwards. For each plotted setting it draws the per-fold accuracies, the
    mean accuracy, and a highlight on the setting with the best mean.

    The function takes no arguments and returns nothing; it draws on a new
    matplotlib figure.
    """
    n_folds = 5

    iris = load_iris()
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        iris.data, iris.target, random_state=0)

    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                  'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(SVC(), param_grid, cv=n_folds)
    grid_search.fit(X_trainval, y_trainval)

    # Only the settings from position 15 onwards are shown.
    results = pd.DataFrame(grid_search.cv_results_)[15:]

    # Position, within the plotted rows, of the highest mean score.
    best = np.argmax(results.mean_test_score.values)

    # Per-fold score columns of cv_results_ are named split<k>_test_score.
    fold_columns = ['split%d_test_score' % fold for fold in range(n_folds)]

    plt.figure(figsize=(10, 3))
    plt.xlim(-1, len(results))
    plt.ylim(0, 1.1)
    for i, (_, row) in enumerate(results.iterrows()):
        scores = row[fold_columns]
        marker_cv, = plt.plot([i] * n_folds, scores, '^', c='gray',
                              markersize=5, alpha=.5)
        marker_mean, = plt.plot(i, row.mean_test_score, 'v', c='none',
                                alpha=1, markersize=10, markeredgecolor='k')
        if i == best:
            marker_best, = plt.plot(i, row.mean_test_score, 'o', c='red',
                                    fillstyle="none", alpha=1, markersize=20,
                                    markeredgewidth=3)

    # Label each tick with the parameters of the row actually plotted there,
    # so labels and tick positions agree in both number and meaning.
    tick_labels = [str(params).strip("{}").replace("'", "")
                   for params in results['params']]
    plt.xticks(range(len(results)), tick_labels, rotation=90)
    plt.ylabel("Validation accuracy")
    plt.xlabel("Parameter settings")
    plt.legend([marker_cv, marker_mean, marker_best],
               ["cv accuracy", "mean accuracy", "best parameter setting"],
               loc=(1.05, .4))