def test_kmean():
    """Smoke-test ``KMean.run`` on the training split of the simulation dataset.

    Passes if the returned clusters value is truthy and the returned centers
    value is not ``None``.
    """
    # Only the training split is used; the other two splits are discarded.
    train_set, _valid_set, _test_set = DatasetManager.read_dataset(
        dataset_name="dataset_simulation_20.csv",
        shared=False
    )

    clustering = KMean()
    found_clusters, found_centers = clustering.run(
        dataset=train_set[0],
        n_clusters=5,
        max_iters=100,
        threshold=1.0
    )

    assert found_clusters
    assert found_centers is not None
def run_experiment():
    """Plot a two-component PCA embedding of the default training split.

    The first element of the training split is normalized with
    ``get_gaussian_normalization``, reduced with ``get_pca`` to two components
    and handed to ``plot_embedding`` together with the second element of the
    split.
    """
    from datasets import DatasetManager
    from preprocessing.scaling import get_gaussian_normalization
    from preprocessing.dimensionality_reduction import get_pca

    # Only the training split is plotted; the other two splits are unused.
    train_set, _valid_set, _test_set = DatasetManager.read_dataset()
    samples, targets = train_set

    # Bring the data down to two components so it can be drawn on a plane.
    normalized = get_gaussian_normalization(samples)
    # Disabled alternative embedding:
    #   get_LLE(dataset, num_components=2, n_neighbors=80)
    embedded, _explained_variance_ratio = get_pca(normalized, num_components=2)

    plot_embedding(embedded, targets)
def theano_experiments():
    """Build the Theano DNN/DBN model collections and launch the experiment.

    Steps performed:

    1. Read the labeled CSV with an 80/20 train/validation split.
    2. Load the labeled test CSV and the unlabeled CSV, passing along the
       ``sklearn_preprocessing`` and ``sklearn_feature_selection`` entries of
       the dictionary returned in step 1.
    3. Build tanh, ReLU and sigmoid networks plus plain and Gaussian DBNs, all
       drawing from one shared ``RandomState``.
    4. Pass every model and the hyper-parameters to
       ``run_theano_experiments`` as a regression task.
    """
    seed = 5
    rgn = numpy.random.RandomState(seed)

    base_dir = os.path.dirname(__file__)
    labeled_csv = os.path.join(base_dir, 'dataset', 'meters', 'cinvestav_labeled.csv')
    labeled_test_csv = os.path.join(base_dir, 'dataset', 'meters', 'cinvestav_labeled_test.csv')
    # NOTE(review): this file is read from "dataset", not "dataset/meters" like
    # the two labeled files above; confirm that this is intentional.
    unlabeled_csv = os.path.join(base_dir, "dataset", 'cinvestav_unlabeled.csv')

    datasets = DatasetManager.read_dataset(
        dataset_name=labeled_csv,
        shared=True,
        seed=seed,
        expected_output=['result_x', 'result_y'],
        skipped_columns=[],
        label_encoding_columns_name=[],
        sklearn_preprocessing=preprocessing.StandardScaler(with_mean=True, with_std=True),
        sklearn_feature_selection=feature_selection.VarianceThreshold(),
        train_ratio=.8,
        test_ratio=0,
        valid_ratio=.2
    )

    # Reuse the preprocessing / feature-selection objects stored with the
    # training data for every additional file that is loaded.
    preproc = datasets['sklearn_preprocessing']
    selector = datasets['sklearn_feature_selection']

    test_set = DatasetManager.get_prediction_set(
        dataset_name=labeled_test_csv,
        expected_output=['result_x', 'result_y'],
        label_encoding_columns_name=[],
        skipped_columns=[],
        sklearn_preprocessing=preproc,
        sklearn_feature_selection=selector,
        shared=True
    )

    dataset_unlabeled = DatasetManager.get_prediction_set(
        dataset_name=unlabeled_csv,
        skipped_columns=['result_x', 'result_y'],
        label_encoding_columns_name=[],
        sklearn_preprocessing=preproc,
        sklearn_feature_selection=selector,
        shared=True
    )

    datasets['test_set'] = test_set
    datasets['dataset_unlabeled'] = dataset_unlabeled
    datasets['prediction_set'] = datasets['test_set'][0].get_value()

    # Input/output widths are taken from the second axis of the training data.
    train_set_x, train_set_y = datasets['train_set']
    n_in = train_set_x.get_value().shape[1]
    n_out = train_set_y.get_value().shape[1]

    def build_dnn(activation):
        # One group of feed-forward networks for the given activation.
        return get_neural_networks(n_in, n_out, rgn, activation_function=activation)

    def build_dbn(use_gaussian):
        # One group of deep belief networks (Gaussian variant on request).
        return get_dbn(n_in, n_out, rgn, gaussian=use_gaussian)

    # All builders draw from the same ``rgn``, so the construction order
    # below is kept fixed: tanh, relu, sigmoid, dbn, gdbn.
    dnn_tanh_models = build_dnn(T.tanh)
    dnn_relu_models = build_dnn(T.nnet.relu)
    dnn_sigmoid_models = build_dnn(T.nnet.sigmoid)
    dbn_models = build_dbn(False)
    gdbn_models = build_dbn(True)

    # The order in which models are handed to the runner differs from the
    # construction order above.
    models = []
    for group in (
        dnn_relu_models,
        dnn_sigmoid_models,
        dnn_tanh_models,
        gdbn_models,
        dbn_models,
    ):
        models.extend(group)

    params = {
        'learning_rate': .01,
        'annealing_learning_rate': .99999,
        'l1_learning_rate': 0.01,
        'l2_learning_rate': 0.001,
        'n_epochs': 2000,
        'batch_size': 20,
        'pre_training_epochs': 50,
        'pre_train_lr': 0.01,
        'k': 1,
        'datasets': datasets,
        'noise_rate': .1,
        'dropout_rate': None
    }

    run_theano_experiments(
        models=models,
        seed=seed,
        params=params,
        experiment_name='all_models_with_noise_without_dropout',
        task_type='regression'
    )
def sklearn_experiments():
    """Build the non-Theano baseline models and launch the experiment.

    The labeled CSV is read entirely as training data (``train_ratio=1``), the
    labeled test CSV is loaded with the ``sklearn_preprocessing`` and
    ``sklearn_feature_selection`` entries of the returned dictionary, and an
    RBF network, a k-nearest-neighbours regressor and a gradient boosting
    regressor are passed to ``run_experiments_sklearn`` as a regression task.
    """
    seed = 5

    base_dir = os.path.dirname(__file__)
    labeled_csv = os.path.join(base_dir, "dataset", 'cinvestav_labeled.csv')
    labeled_test_csv = os.path.join(base_dir, "dataset", 'cinvestav_labeled_test.csv')

    datasets = DatasetManager.read_dataset(
        dataset_name=labeled_csv,
        shared=False,
        seed=seed,
        expected_output=['result_x', 'result_y'],
        skipped_columns=[],
        label_encoding_columns_name=[],
        sklearn_preprocessing=preprocessing.StandardScaler(with_mean=True, with_std=True),
        sklearn_feature_selection=feature_selection.VarianceThreshold(),
        train_ratio=1,
        test_ratio=0,
        valid_ratio=0
    )

    test_set = DatasetManager.get_prediction_set(
        dataset_name=labeled_test_csv,
        expected_output=['result_x', 'result_y'],
        label_encoding_columns_name=[],
        skipped_columns=[],
        shared=False,
        sklearn_preprocessing=datasets['sklearn_preprocessing'],
        sklearn_feature_selection=datasets['sklearn_feature_selection'],
    )

    datasets['test_set'] = test_set
    datasets['prediction_set'] = datasets['test_set'][0]

    # Input/output widths are taken from the second axis of the training data.
    train_set_x, train_set_y = datasets['train_set']
    n_in = train_set_x.shape[1]
    n_out = train_set_y.shape[1]

    # Radial basis function network.
    rbf = RBF(
        input_length=n_in,
        hidden_length=500,
        out_lenght=n_out
    )

    # k-nearest-neighbours regressor.
    knn_estimator = KNeighborsRegressor(n_neighbors=10)
    knn = SklearnNetwork(sklearn_model=knn_estimator, num_output=n_out)

    # Gradient boosting regressor; it is reported under the "Ada Boosting"
    # label in the model list below.
    boosting_estimator = GradientBoostingRegressor(
        n_estimators=1000,
        learning_rate=.1,
        max_depth=5,
        loss='ls'
    )
    ada_boosting = SklearnNetwork(sklearn_model=boosting_estimator, num_output=n_out)

    # (display name, model) pairs in the order they are handed to the runner.
    models = [
        ('Ada Boosting', ada_boosting),
        ('Radar', knn),
        ('cRBF', rbf)
    ]

    params = {'datasets': datasets}

    run_experiments_sklearn(
        models=models,
        seed=seed,
        params=params,
        experiment_name='traditional_algorithms',
        task_type='regression'
    )