def train_test_plot(data, plot=False):
    trainX, trainY, testX, testY = splitData(data)
    clf = ensemble.GradientBoostingRegressor(**params)
    clf.fit(trainX, trainY)
    mse = mean_squared_error(testY, clf.predict(testX))
    print("MSE: %.4f" % mse)
    if plot:
        ###############################################################################
        # Plot training deviance

        # Compute test set deviance at each boosting stage. Note that the
        # loss_ attribute was deprecated in scikit-learn 1.1, so this call
        # needs an older release.
        test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
        for i, y_pred in enumerate(clf.staged_predict(testX)):
            test_score[i] = clf.loss_(testY, y_pred)
        plt.figure(figsize=(12, 6))
        plt.subplot(1, 2, 1)
        plt.title('Deviance')
        plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_,
                 'b-', label='Training Set Deviance')
        plt.plot(np.arange(params['n_estimators']) + 1, test_score,
                 'r-', label='Test Set Deviance')
        plt.legend(loc='upper right')
        plt.xlabel('Boosting Iterations')
        plt.ylabel('Deviance')
    return clf
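# `splitData` and `params` come from the surrounding project and are not part
# of this snippet. A minimal sketch of what splitData(data) plausibly does,
# assuming the last column of the date-sorted frame holds the target and the
# earlier rows form the training set (both the signature and the split rule
# are assumptions, not the project's actual implementation):
import numpy as np

def splitData(data, test_ratio=0.2):
    values = data.values
    n_train = int(len(values) * (1 - test_ratio))
    trainX, trainY = values[:n_train, :-1], values[:n_train, -1]
    testX, testY = values[n_train:, :-1], values[n_train:, -1]
    return trainX, trainY, testX, testY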
def load_data(self):
    cfg = basic_config(self.newDir)
    if self.method == 'faster':
        cfg.faster_rcnn_config()
    elif self.method == 'fast':
        cfg.fast_rcnn_config()
    elif self.method == 'normal':
        cfg.rcnn_config()
    cfg = mcfg.rcnn_hoi_classes(cfg)
    cfg = set_config(cfg)
    cfg.get_args()
    cfg.dataset = 'HICO'
    cfg.update_paths()

    trainMeta = utils.load_dict(cfg.data_path + 'train')
    testMeta = utils.load_dict(cfg.data_path + 'test')
    trainGTMeta = utils.load_dict(cfg.data_path + 'train_GT')
    testGTMeta = utils.load_dict(cfg.data_path + 'test_GT')
    labels = utils.load_dict(cfg.data_path + 'labels')
    class_mapping = utils.load_dict(cfg.data_path + 'class_mapping')

    if cfg.max_classes is not None:
        # Reduce data to include only max_classes number of different classes
        _, counts = utils.getLabelStats(trainGTMeta, labels)
        reduced_idxs = utils.getReducedIdxs(counts, cfg.max_classes, labels)
        trainGTMeta = utils.reduceData(trainGTMeta, reduced_idxs)
        testGTMeta = utils.reduceData(testGTMeta, reduced_idxs)
        trainMeta = utils.reduceData(trainMeta, reduced_idxs)
        testMeta = utils.reduceData(testMeta, reduced_idxs)
        labels = utils.idxs2labels(reduced_idxs, labels)

    cfg.nb_classes = len(labels)
    cfg.set_class_weights(labels, trainGTMeta)
    _, valMeta = utils.splitData(list(trainMeta.keys()), trainMeta)

    self.cfg = cfg
    if cfg.move:
        self.move_data()
    print('Data:', cfg.data_path)
    print('Path:', cfg.my_results_path)

    self.labels = labels
    self.class_mapping = class_mapping
    self.trainMeta = trainMeta
    self.valMeta = valMeta
    self.testMeta = testMeta
    self.trainGTMeta = trainGTMeta
    self.testGTMeta = testGTMeta
def runTests(X, Y):
    Xtr, Xte, Ytr, Yte = utils.splitData(X, Y, 0.9)
    print("X and Y shapes =", X.shape, Y.shape)
    results, estimator = nn.train(Xtr, Ytr)
    print(results)
    estimator.fit(Xtr, Ytr)
    Yhat = estimator.predict(Xte)
    mse = utils.mse(Yte, Yhat)
    print("mse on testing data is", mse)
    return (results, mse)
def load_data(self):
    cfg = basic_config(self.newDir)
    cfg = set_config(cfg)
    cfg.get_args()
    cfg.update_paths()

    trainMeta = utils.load_dict(cfg.data_path + 'train')
    testMeta = utils.load_dict(cfg.data_path + 'test')
    trainGTMeta = utils.load_dict(cfg.data_path + 'train_GT')
    testGTMeta = utils.load_dict(cfg.data_path + 'test_GT')
    labels = utils.load_dict(cfg.data_path + 'labels')

    if cfg.max_classes is not None:
        # Reduce data to include only max_classes number of different classes
        _, counts = utils.getLabelStats(trainGTMeta, labels)
        trainGTMeta, reduced_idxs = utils.reduceTrainData(
            trainGTMeta, counts, cfg.max_classes)
        testGTMeta = utils.reduceTestData(testGTMeta, reduced_idxs)
        trainMeta = utils.reduceTestData(trainMeta, reduced_idxs)
        testMeta = utils.reduceTestData(testMeta, reduced_idxs)
        labels = utils.idxs2labels(reduced_idxs, labels)

    cfg.nb_classes = len(labels)
    cfg.set_class_weights(labels, trainGTMeta)
    _, valMeta = utils.splitData(list(trainMeta.keys()), trainMeta)

    self.cfg = cfg
    if cfg.move:
        self.move_data()
    print('Path:', cfg.my_results_path)

    self.labels = labels
    self.trainMeta = trainMeta
    self.valMeta = valMeta
    self.testMeta = testMeta
    self.trainGTMeta = trainGTMeta
    self.testGTMeta = testGTMeta
# -*- coding: utf-8 -*-
"""
Created on Sun May 22 17:25:28 2016

@author: Haolin
"""
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from utils import splitData

if __name__ == '__main__':
    data = pd.read_csv('../out/artists_all_features.csv',
                       index_col=['artist_id', 'date'])
    data.sort_index(level=1, inplace=True)  # sortlevel was removed from pandas; use sort_index
    trainX, trainY, testX, testY = splitData(data)
    clf = linear_model.LassoCV(max_iter=5000)
    clf.fit(trainX, trainY)
    mse = mean_squared_error(testY, clf.predict(testX))
    print("MSE: %.4f" % mse)
    sortedCoef = np.sort(clf.coef_)
    argSortedCoef = np.argsort(clf.coef_)
    print(sortedCoef)
    print(argSortedCoef)
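# The argsort indices printed above do not say which features the
# coefficients belong to. A short follow-up that pairs them explicitly,
# assuming splitData keeps the column order of `data` and the last column is
# the target (both assumptions; the utility is not shown):
feature_names = data.columns[:-1]
for idx in np.argsort(np.abs(clf.coef_))[::-1]:
    print("%s: %.4f" % (feature_names[idx], clf.coef_[idx]))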
def __init__(self, filename, inputVariableNames, outputVariableName):
    self.__filename = filename
    self.__inputs, self.__outputs = self.readData(inputVariableNames,
                                                  outputVariableName)
    self.__trainInputs, self.__trainOutputs, self.__testInputs, self.__testOutputs = splitData(
        self.__inputs, self.__outputs)
#%% Save base train data to file
np.save(os.path.join(cf.DATA_DIR, 'some_voxs.npy'), all_voxs, allow_pickle=True)
np.save(os.path.join(cf.DATA_DIR, 'some_mids.npy'), all_mids, allow_pickle=True)

#%% Load base train data from file
prefix = 'all' if cf.REMOTE else 'some'
all_voxs = np.load(os.path.join(save_dir, prefix + '_voxs.npy'), allow_pickle=True)
all_mids = np.load(os.path.join(save_dir, prefix + '_mids.npy'), allow_pickle=True)

#%% Setup datasets
voxs_stacked = np.stack(all_voxs, axis=0)
train_dataset = tf.data.Dataset.from_tensor_slices((voxs_stacked, all_mids))
for test_samples, test_labels in train_dataset.batch(50).take(1):
    pass
test_samples = tf.cast(test_samples, dtype=tf.float32)

train_dataset, test_dataset = ut.splitData(train_dataset, 0.1)
train_dataset = train_dataset.batch(cf_batch_size, drop_remainder=True)
test_dataset = test_dataset.batch(cf_batch_size, drop_remainder=False)
total_train_batchs = 0
for _ in train_dataset:
    total_train_batchs += 1

#%% Show initial models
sample_index = 16
ut.plotVox(test_samples[sample_index], title='Original', threshold=0.5,
           limits=cf_limits, save_fig=False)
if lg.total_epochs > 10:
    ut.plotVox(model.reconstruct(test_samples[sample_index][None, ...], training=False),
               limits=cf_limits, title='Recon')

#%% Training methods
def getTestSetLoss(dataset, batches=0):
    test_losses = []
    for test_x, test_label in (dataset.take(batches).shuffle(100)
                               if batches > 0 else dataset.shuffle(100)):
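# `ut.splitData` above splits a tf.data.Dataset by ratio; the utility itself
# is not shown. A minimal sketch under the assumed contract, returning
# (train, test) with the given fraction held out, and assuming TF 2.3+ where
# the cardinality of a from_tensor_slices dataset is known:
import tensorflow as tf

def splitData(dataset, test_frac):
    # Hypothetical reconstruction: carve the first test_frac of the
    # unbatched dataset off as the test split via take/skip.
    n_total = dataset.cardinality().numpy()
    n_test = int(n_total * test_frac)
    return dataset.skip(n_test), dataset.take(n_test)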
from utils import loadData, splitData
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

if __name__ == "__main__":
    inputs, outputs = loadData(
        "D:\\UBB_info_sem_4\\AI\\LAB\\Teme\\Lab8\\2016.csv",
        'Economy (GDP per Capita)', 'Freedom', 'Happiness Score')
    trainInputs, trainOutputs, validationInputs, validationOutputs = splitData(
        inputs, outputs)
    xx = [[el1, el2] for el1, el2 in zip(trainInputs[0], trainInputs[1])]
    regressor = linear_model.SGDRegressor(alpha=0.01, max_iter=100)
    for _ in range(1000):
        regressor.partial_fit(xx, trainOutputs)
    w0, w1, w2 = regressor.intercept_[0], regressor.coef_[0], regressor.coef_[1]
    print('the learnt model (tool): f(x) = ', w0, ' + ', w1, ' * x1', ' + ', w2, ' * x2')
    computedTestOutputs = regressor.predict(
        [[el1, el2] for el1, el2 in zip(validationInputs[0], validationInputs[1])])
    error = mean_squared_error(validationOutputs, computedTestOutputs)
    print('prediction error (tool): ', error)
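# Aside: each partial_fit call performs a single pass over the data and
# ignores max_iter, so the loop above amounts to roughly 1000 epochs. A close
# equivalent with a single fit call (tol=None disables the early stopping
# that fit would otherwise apply) is:
regressor = linear_model.SGDRegressor(alpha=0.01, max_iter=1000, tol=None)
regressor.fit(xx, trainOutputs)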
if __name__ == '__main__':
    data = pd.read_csv('../out/artists_all_features.csv',
                       index_col=['artist_id', 'date'])
    data.sort_index(level=1, inplace=True)  # sortlevel was removed from pandas; use sort_index
    clf = train_test_plot(data)
    sortedFeatureImportances = np.sort(clf.feature_importances_)
    print(sortedFeatureImportances)
    argsortedFeatureImportances = np.argsort(clf.feature_importances_)
    print(argsortedFeatureImportances)

    # Drop every feature whose importance is below 1e-2, then retrain.
    cnt = 0
    while sortedFeatureImportances[cnt] < 1e-2:
        cnt += 1
    cols = data.columns
    for i in range(cnt):
        col = cols[argsortedFeatureImportances[i]]
        print('del %s' % col)
        del data[col]
    clf = train_test_plot(data)

    X, y, testX, _ = splitData(data, isTest=False)
    clf = ensemble.GradientBoostingRegressor(**params)
    clf.fit(X, y)
    # predictY = clf.predict(testX)
    # for i, v in enumerate(predictY):
    #     if i % 50 == 0:
    #         print(v)
    output(clf, testX)
Nsamples = InputArray.shape[0]

# computing mean of the input and output data
mean_out = np.mean(OutputArray[0:n_train, :])
mean_in = np.mean(InputArray[0:n_train, :])
output("mean of input / output is %.6f\t %.6f" % (mean_in, mean_out))

# treating the data
InputArray /= mean_in * 2
InputArray -= 0.5
OutputArray -= mean_out

(X_train, Y_train, X_test, Y_test) = splitData(InputArray, OutputArray,
                                               n_train, n_test, output)

# parameters
# Nx = 2^(L-1) * m
m = Nx // (2**(L - 1))
output('m = %d' % m)

# defining the error as the relative error with respect to the
# original data
def error(model, X, Y):
    return rel_error(model, X, Y, meanY=mean_out)

##################### Building the Network ########################
(n_b_ad, n_b_2, n_b_l) = (1, 2, 3)  # see the paper arXiv:1807.01883
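# `rel_error` is referenced but not defined in this excerpt. A hypothetical
# reconstruction, assuming a Keras-style model with .predict and that the
# error is a relative l2 norm measured against the untreated outputs (meanY
# adds the training mean back into the denominator):
def rel_error(model, X, Y, meanY=0.0):
    Y_pred = model.predict(X)
    return np.linalg.norm(Y_pred - Y) / np.linalg.norm(Y + meanY)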
def loadData(args, channel_first):
    n_slices_per_dev = args.num_slice
    start_ix = args.start_ix
    file_key = args.file_key

    dev_dir_list = []
    dev_dir_names = os.listdir(args.root_dir)
    for n in dev_dir_names:
        tmp = os.path.join(args.root_dir, n)
        dev_dir_list.append(tmp)
    stride = args.stride
    n_devices = len(dev_dir_list)
    # locations = ["after_fft", "before_fft", "output_equ", "symbols"]
    # locations = ["output_equ"]

    if channel_first:
        slice_dims = (2, args.slice_len)
        samps_to_retrieve = (n_slices_per_dev - 1) * stride + slice_dims[1]
    else:
        slice_dims = (args.slice_len, 2)
        samps_to_retrieve = (n_slices_per_dev - 1) * stride + slice_dims[0]

    x_train, y_train, x_test, y_test = [], [], [], []
    split_ratio = {'train': 0.8, 'val': 0.2}
    for i, d in enumerate(dev_dir_list):
        p = os.path.join(d, args.location)
        pre_X_data, num_tran = dev_bin_dataset(os.path.join(p, file_key),
                                               samps_to_retrieve, start_ix,
                                               channel_first, uniform=True)
        # Cut the recording into n_slices_per_dev strided slices.
        X_data_pd = []
        count_s = 0
        for j in range(0, samps_to_retrieve, stride):
            if channel_first:
                X_data_pd.append(pre_X_data[:, j:j + slice_dims[1]])
            else:
                X_data_pd.append(pre_X_data[j:j + slice_dims[0], :])
            count_s += 1
            if count_s == n_slices_per_dev:
                break
        X_data_pd = np.array(X_data_pd)
        y_data_pd = i * np.ones(n_slices_per_dev, )

        # split one class data
        uniform = True
        x_train_pd, x_test_pd, y_train_pd, y_test_pd = [], [], [], []
        if uniform:
            # Split each transmission's slices separately so that train and
            # test both cover every transmission.
            samples_per_tran = n_slices_per_dev // num_tran
            idx = 0
            while idx + samples_per_tran <= n_slices_per_dev:
                # NOTE: the original indexed this window with the device loop
                # variable i; the window has to advance with idx, otherwise
                # every pass re-splits the same block of slices.
                x_train_per_tran, y_train_per_tran, x_test_per_tran, y_test_per_tran = utils.splitData(
                    split_ratio,
                    X_data_pd[idx:idx + samples_per_tran, :, :],
                    y_data_pd[idx:idx + samples_per_tran])
                if idx == 0:
                    x_train_pd, x_test_pd = x_train_per_tran, x_test_per_tran
                    y_train_pd, y_test_pd = y_train_per_tran, y_test_per_tran
                else:
                    x_train_pd = np.concatenate((x_train_pd, x_train_per_tran), axis=0)
                    x_test_pd = np.concatenate((x_test_pd, x_test_per_tran), axis=0)
                    y_train_pd = np.concatenate((y_train_pd, y_train_per_tran), axis=0)
                    y_test_pd = np.concatenate((y_test_pd, y_test_per_tran), axis=0)
                idx += samples_per_tran
        else:
            x_train_pd, y_train_pd, x_test_pd, y_test_pd = utils.splitData(
                split_ratio, X_data_pd, y_data_pd)

        if i == 0:
            x_train, x_test = x_train_pd, x_test_pd
            y_train, y_test = y_train_pd, y_test_pd
        else:
            x_train = np.concatenate((x_train, x_train_pd), axis=0)
            x_test = np.concatenate((x_test, x_test_pd), axis=0)
            y_train = np.concatenate((y_train, y_train_pd), axis=0)
            y_test = np.concatenate((y_test, y_test_pd), axis=0)
        del pre_X_data
        del X_data_pd

    if args.D2:
        if channel_first:
            x_train = x_train[:, :, np.newaxis, :]
            x_test = x_test[:, :, np.newaxis, :]
        else:
            x_train = x_train[:, np.newaxis, :, :]
            x_test = x_test[:, np.newaxis, :, :]
    y_train = np_utils.to_categorical(y_train, n_devices)
    y_test = np_utils.to_categorical(y_test, n_devices)
    return x_train, y_train, x_test, y_test, n_devices
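# `utils.splitData` here takes a ratio dict and returns
# (x_train, y_train, x_test, y_test). The utility is not shown; a minimal
# sketch under that assumed contract, with a deterministic front/back split
# on the 'train' fraction (the real utility may shuffle first):
import numpy as np

def splitData(split_ratio, X, y):
    n_train = int(len(X) * split_ratio['train'])
    return X[:n_train], y[:n_train], X[n_train:], y[n_train:]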
""" import pandas as pd import numpy as np from sklearn import ensemble from sklearn.metrics import accuracy_score from utils import splitData, output params = {'n_estimators': 500, 'max_leaf_nodes': 4, 'max_depth': None, 'min_samples_split': 5, 'learning_rate': 0.1, 'subsample': 0.5} if __name__ == '__main__': data = pd.read_csv('../out/artists_all_features.csv', index_col=['artist_id', 'date']) data.sortlevel(level=1, inplace=True) clf = ensemble.GradientBoostingClassifier(**params) trainX, trainY, testX, testY = splitData(data, pickUp=-4) clf.fit(trainX, trainY) print accuracy_score(testY, clf.predict(testX)) X, y, testX, _ = splitData(data, isTest=False, pickUp=-4) clf.fit(X, y) y3 = clf.predict(testX) trainX, trainY, testX, testY = splitData(data, pickUp=-3) clf.fit(trainX, trainY) print accuracy_score(testY, clf.predict(testX)) X, y, testX, _ = splitData(data, isTest=False, pickUp=-3) clf.fit(X, y) y2 = clf.predict(testX)
def loadData(args, channel_first):
    n_slices_per_dev = args.num_slice
    start_ix = args.start_ix
    file_key = args.file_key

    dev_dir_list = []
    dev_dir_names = os.listdir(args.root_dir)
    for n in dev_dir_names:
        tmp = os.path.join(args.root_dir, n)
        dev_dir_list.append(tmp)
    stride = args.stride
    n_devices = len(dev_dir_list)

    if channel_first:
        slice_dims = (2, args.slice_len)
        samps_to_retrieve = (n_slices_per_dev - 1) * stride + slice_dims[1]
    else:
        slice_dims = (args.slice_len, 2)
        samps_to_retrieve = (n_slices_per_dev - 1) * stride + slice_dims[0]

    x_train, y_train, x_test, y_test = [], [], [], []
    split_ratio = {'train': 0.8, 'val': 0.2}
    for i, d in enumerate(dev_dir_list):
        pre_X_data = dev_bin_dataset(os.path.join(d, file_key),
                                     samps_to_retrieve, start_ix, channel_first)
        X_data_pd = []
        count_s = 0
        for j in range(0, samps_to_retrieve, stride):
            if channel_first:
                X_data_pd.append(pre_X_data[:, j:j + slice_dims[1]])
            else:
                X_data_pd.append(pre_X_data[j:j + slice_dims[0], :])
            count_s += 1
            if count_s == n_slices_per_dev:
                break
        X_data_pd = np.array(X_data_pd)
        y_data_pd = i * np.ones(n_slices_per_dev, )

        # split one class data
        x_train_pd, y_train_pd, x_test_pd, y_test_pd = utils.splitData(
            split_ratio, X_data_pd, y_data_pd)
        if i == 0:
            x_train, x_test = x_train_pd, x_test_pd
            y_train, y_test = y_train_pd, y_test_pd
        else:
            x_train = np.concatenate((x_train, x_train_pd), axis=0)
            x_test = np.concatenate((x_test, x_test_pd), axis=0)
            y_train = np.concatenate((y_train, y_train_pd), axis=0)
            y_test = np.concatenate((y_test, y_test_pd), axis=0)
        del pre_X_data
        del X_data_pd

    if args.D2:
        if channel_first:
            x_train = x_train[:, :, np.newaxis, :]
            x_test = x_test[:, :, np.newaxis, :]
        else:
            x_train = x_train[:, np.newaxis, :, :]
            x_test = x_test[:, np.newaxis, :, :]
    y_train = np_utils.to_categorical(y_train, n_devices)
    y_test = np_utils.to_categorical(y_test, n_devices)
    return x_train, y_train, x_test, y_test, n_devices