def silhouette_kcluster(directory, *args, **kwargs):
    logger = new_logger('plot_silhouette_kcluster', directory)
    logger.info('entering')
    X, y = get_mnist(directory)
    X /= 255.
    pca = PCA(n_components=50, whiten=False,
              random_state=42).fit(X)  # PCA preprocessed
    X_pca = pca.transform(X)
    k = [10, 15, 20, 25, 30, 35, 50, 100, 200]
    dict_results = {
        'n_clusters': [],
        'pca_n_components': [],
        'pca_expl_var': [],
        'pca_expl_var_ratio': [],
        'silhouette_kcosine': [],
        'silhouette_kmeans': [],
        'fittime_kcosine': [],
        'fittime_kmeans': []
    }
    for n_clusters in k:
        dict_results['n_clusters'].append(n_clusters)
        dict_results['pca_n_components'].append(pca.n_components_)
        dict_results['pca_expl_var'].append(np.sum(pca.explained_variance_))
        dict_results['pca_expl_var_ratio'].append(
            np.sum(pca.explained_variance_ratio_))
        # kmeans
        clusterer_euclid = KMeans(n_clusters=n_clusters, random_state=42)
        t = time.time()
        clusterer_euclid.fit(X_pca)
        dict_results['fittime_kmeans'].append(time.time() - t)
        dict_results['silhouette_kmeans'].append(
            silhouette_score(X, clusterer_euclid.predict(X_pca),
                             metric='euclidean', random_state=42))
        # kcosine
        clusterer_cosine = KMeans(n_clusters=n_clusters, random_state=42)
        t = time.time()
        clusterer_cosine.fit(X_pca)
        dict_results['fittime_kcosine'].append(time.time() - t)
        dict_results['silhouette_kcosine'].append(
            silhouette_score(X, clusterer_cosine.predict(X_pca),
                             metric='cosine', random_state=42))
    # save results to csv
    with open(os.path.join(directory, 'silhouette_kcluster.csv'), 'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for row in list(map(list, zip(*dict_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
    return
def plot_variance_mean(directory, *args, **kwargs):
    logger = new_logger('plot_variance_mean', directory)
    logger.info('entering')
    X, y = get_mnist(directory)
    image_size = (28, 28)
    # remove the per-pixel mean, scale to [0, 1], then fit the scaler
    scaler = StandardScaler().fit(
        StandardScaler(with_std=False).fit_transform(X) / 255)
    fig, axs = plt.subplots(1, 2, figsize=(8, 4))
    axs[0].imshow(np.resize(scaler.mean_, image_size), cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[1].imshow(np.resize(scaler.var_, image_size), cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[0].set_title(r'$\mu$')
    axs[1].set_title(r'$\sigma^2$')
    fig.tight_layout()
    fig.savefig(
        os.path.join(directory, 'mnist-pixel-variance-and-mean-avgfree.pdf'))
    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             'mnist-pixel-variance-and-mean-avgfree.pgf'),
                format='pgf')
    logger.info('np.max(scaler.mean_) = {0}, np.max(scaler.var_) = {1}'.format(
        np.max(scaler.mean_), np.max(scaler.var_)))
    return
def main(directory, params=()):
    # workdir
    if not os.path.isdir(directory):
        try:
            os.mkdir(directory)
        except PermissionError as e:
            print('mkdir failed due to missing privileges: {0}'.format(e))
            exit(1)
    # subfolder for results
    file_dir = os.path.join(directory, 'compare_datasets')
    if not os.path.isdir(file_dir):
        os.mkdir(file_dir)
    logger = new_logger('main', directory=file_dir)
    logger.info('Started main with directory={0} and params={1}'.format(
        directory, params))
    # register parameters
    experiment_names = {
        'dataset_imbalance': dataset_imbalance,
    }
    # run specified programs
    for param in params:
        if param in experiment_names:
            experiment_names[param](file_dir)
        else:
            logger.warning('Parameter {0} invalid/not found.'.format(param))
def elm_bip(directory):
    self_name = 'elm_bip'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)
    # preprocessing
    X /= 255.
    pca = PCA(n_components=50).fit(X)
    X_preprocessed = pca.transform(X)
    logger.info('{0} features remaining after preprocessing.'.format(
        X_preprocessed.shape[1]))
    # prepare parameter grid
    param_grid = [{
        'hidden_layer_size': [500, 1000, 2000, 4000],
        'activation': ['tanh'],
        'alpha': [1e-5],
        'random_state': [42]
    }]
    # setup estimator
    estimator = ELMClassifier(input_to_node=BatchIntrinsicPlasticity(),
                              regressor=Ridge())
    # setup grid search
    cv = GridSearchCV(estimator=estimator,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=1,
                      verbose=2,
                      refit=False,
                      cv=[(np.arange(0, train_size),
                           np.arange(train_size, 70000))])
    # run!
    cv.fit(X, y_encoded)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))
    # refine results
    cv_results = cv.cv_results_
    del cv_results['params']
    # save results
    try:
        with open(os.path.join(directory, '{0}.csv'.format(self_name)),
                  'w') as f:
            f.write(','.join(cv_results.keys()) + '\n')
            for row in list(map(list, zip(*cv_results.values()))):
                f.write(','.join(map(str, row)) + '\n')
    except PermissionError as e:
        print('Missing privileges: {0}'.format(e))
def plot_imbalance(directory):
    self_name = 'plot_imbalance'
    logger = new_logger(self_name, directory)
    X, y = get_mnist(directory)
    logger.info('successfully fetched {0} datapoints'.format(X.shape[0]))
    tp_y_unique = np.unique(y.astype(int), return_counts=True)
    y_unique = tp_y_unique[0][np.argsort(tp_y_unique[0])]
    y_counts = tp_y_unique[1][np.argsort(tp_y_unique[0])]
    # y_hist_arr = np.array(y_hist, dtype=float)
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6., 2.1))
    # bar chart of label counts (bar color assumed; the original plotting
    # call appears to be missing here)
    ax.bar(y_unique, y_counts, width=.9, color=tud_colors['lightblue'],
           label='occurrences')
    for idx in range(y_counts.size):
        plt.text(idx * 1., 3500,
                 '{0:.1f}%'.format(y_counts[idx] / np.sum(y_counts) * 100),
                 color=(1., 1., 1., .2), fontsize='small',
                 horizontalalignment='center')
        # w = bar.get_with()
        # plt.text(bar.get_x() - .04, bar.get_y() + .1, '{0:.1f}%'.format())
    ax.set_xlim([-.5, 9.5])
    ax.set_xticks(y_unique)
    ax.set_xticklabels(['{0:.0f}'.format(idx) for idx in y_unique])
    ax.set_xlabel('label')
    ax.set_ylim([0, 8000])
    ax.set_yticks([7000], minor=True)
    ax.grid(which='minor', axis='y', alpha=.7, linestyle='--',
            color=tud_colors['lightgreen'])
    ax.set_ylabel(r'\#occurrences')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    # ax.spines['bottom'].set_visible(False)
    ax.tick_params(axis='x', which='both', bottom=False, top=False)
    ax.legend(bbox_to_anchor=(1, .5), loc='center left')
    fig.tight_layout()
    # fig.patch.set_visible(False)
    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             '{0}.pgf'.format(self_name)), format='pgf')
    fig.savefig(os.path.join(directory, '{0}.pdf'.format(self_name)),
                format='pdf')
    return
def main(out_directory=os.getcwd(), param_list=None):
    logger = new_logger('main', directory=out_directory)
    logger.info('Created logger successfully')
    for module_name in ['scipy', 'numpy', 'sklearn', 'pyrcn']:
        if module_name not in sys.modules:
            logger.error('Module {0} was not loaded'.format(module_name))
        else:
            logger.info('Module {0} loaded'.format(module_name))
    from pyrcn.extreme_learning_machine.tests import test_elm
    test_elm.test_iris_ensemble_iterative_regression()
    logger.info('Test run finished')
    return
def main(out_path=os.path.join(os.getcwd(), 'preprocessing-mnist'),
         function_name='labels'):
    if not os.path.exists(out_path):
        try:
            os.makedirs(out_path)
        except OSError as error:
            print(error)
    # quick and dirty
    # directory = os.path.join(os.getcwd(), 'preprocessing-mnist')
    directory = out_path
    logger = new_logger('main')
    logger.info('{0} called, entering main'.format(__file__))
    runtime = [time.time()]
    # fetch data
    X, y = get_mnist()
    runtime.append(time.time())
    logger.info('fetch: {0} s'.format(np.diff(runtime[-2:])))
    logger.info('X.shape = {0}, y.shape = {1}'.format(X.shape, y.shape))
    function_dict = {
        'labels': plot_labels,
        'plot_pooling': plot_pooling,
        'plot_poster': plot_poster,
        'histogram': plot_historgram,
        'var': plot_var,
        'normalized': plot_normalized,
        'variance_mean': plot_variance_mean,
        'image_min_var': plot_image_min_var,
        'plot_pca': plot_pca,
        'plot_covariance': plot_covariance,
        'plot_imbalance': plot_imbalance,
        'plot_img_cluster': plot_img_cluster,
    }
    if function_name in function_dict:
        function_dict[function_name](directory)
    else:
        logger.warning('no function {0} found'.format(function_name))
    logger.info('{0} finished, return from main'.format(__file__))
def main(directory, params):
    # workdir
    if not os.path.isdir(directory):
        try:
            os.mkdir(directory)
        except PermissionError as e:
            print('mkdir failed due to missing privileges: {0}'.format(e))
            exit(1)
    # subfolder for results
    file_dir = os.path.join(directory, 'mnist-elm')
    if not os.path.isdir(file_dir):
        os.mkdir(file_dir)
    logger = new_logger('main', directory=file_dir)
    logger.info('Started main with directory={0} and params={1}'.format(
        directory, params))
    # register parameters
    experiment_names = {
        'train_kmeans': train_kmeans,
        'elm_hyperparameters': elm_hyperparameters,
        'elm_basic': elm_basic,
        'elm_pca': elm_pca,
        'elm_preprocessed': elm_preprocessed,
        'elm_random_state': elm_random_state,
        'elm_hidden_layer_size': elm_hidden_layer_size,
        'elm_coates': elm_coates,
        'elm_coates_stacked': elm_coates_stacked,
        'silhouette_n_clusters': silhouette_n_clusters,
        'silhouette_subset': silhouette_subset,
        'silhouette_kcluster': silhouette_kcluster,
        'silhouette_features': silhouette_features
    }
    # run specified programs
    for param in params:
        if param in experiment_names:
            experiment_names[param](file_dir)
        else:
            logger.warning('Parameter {0} invalid/not found.'.format(param))
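# Minimal usage sketch (an assumption, not part of the original experiment
# code): the dispatcher above maps experiment names to functions, so a
# plausible command-line entry point could take the working directory as the
# first argument and the registered experiment names as the remaining ones.
if __name__ == '__main__':
    import sys

    # e.g.: python run_experiments.py ./results elm_pca silhouette_kcluster
    main(sys.argv[1], sys.argv[2:])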
def elm_coates(directory):
    self_name = 'elm_coates'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)
    filepath_label_encoder = os.path.join(
        directory, 'label_encoder_{0}.pickle'.format(self_name))
    # save label_encoder
    try:
        with open(filepath_label_encoder, 'wb') as f:
            pickle.dump(label_encoder, f)
    except Exception as e:
        logger.error('Unexpected error: {0}'.format(e))
        exit(1)
    # scale X so X in [0, 1]
    X /= 255.
    X_train, X_test, y_train, y_test = (X[:train_size, ...], X[train_size:],
                                        y_encoded[:train_size],
                                        y_encoded[train_size:])
    csv_filepath = os.path.join(directory, '{0}.csv'.format(self_name))
    # read input matrices from files
    list_filepaths = []
    for filepath in glob.glob(
            os.path.join(directory, '*pca*+kmeans*_matrix.npy')):
        logger.info('matrix file found: {0}'.format(filepath))
        list_filepaths.append(filepath)
        filename = os.path.splitext(os.path.basename(filepath))[0]
        est_filepath = os.path.join(directory,
                                    'est_coates-{0}.pickle'.format(filename))
        pred_filepath = os.path.join(
            directory, 'est_coates-{0}-predicted.npz'.format(filename))
        # only if files do not exist yet
        if (not os.path.isfile(csv_filepath)
                or not os.path.isfile(est_filepath)
                or not os.path.isfile(pred_filepath)):
            # setup estimator
            estimator = ELMClassifier(
                input_to_node=PredefinedWeightsInputToNode(
                    predefined_input_weights=np.load(filepath),
                    input_scaling=1.0,
                    bias_scaling=0.0,
                    input_activation='relu',
                    random_state=42),
                chunk_size=1000)
            logger.info('Estimator params: {0}'.format(
                estimator.get_params().keys()))
            # !run
            time_start = time.time()
            estimator.fit(X_train, y_train)
            time_fitted = time.time()
            y_pred = estimator.predict(X_test)
            time_predicted = time.time()
            # !run
            # results
            dict_results = estimator.get_params()
            dict_results.update({
                'filename': filename,
                'fit_time': time_fitted - time_start,
                'score_time': time_predicted - time_fitted,
                'score': accuracy_score(y_test, y_pred)
            })
            # drop data
            dict_results.pop('input_to_nodes__predefined_input_weights')
            dict_results.pop('input_to_nodes')
            dict_results.pop('regressor')
            logger.info('fitted time {1}, score on test set: {0}'.format(
                dict_results['score'], dict_results['fit_time']))
            # save estimator
            try:
                with open(est_filepath, 'wb') as f:
                    pickle.dump(estimator, f)
            except Exception as e:
                logger.error('Unexpected error: {0}'.format(e))
                exit(1)
            # save results
            try:
                if not os.path.isfile(csv_filepath):
                    with open(csv_filepath, 'a') as f:
                        f.write(','.join(dict_results.keys()))
                        f.write('\n')
                        f.write(','.join(
                            [str(item) for item in dict_results.values()]))
                        f.write('\n')
                else:
                    with open(csv_filepath, 'a') as f:
                        f.write(','.join(
                            [str(item) for item in dict_results.values()]))
                        f.write('\n')
            except PermissionError as e:
                print('Missing privileges: {0}'.format(e))
            # save prediction
            np.savez_compressed(
                pred_filepath,
                X_test=X_test,
                y_test=label_encoder.inverse_transform(y_test),
                y_pred=label_encoder.inverse_transform(y_pred))
    if not list_filepaths:
        logger.warning('no input weights matrices found')
    return
def dataset_imbalance(directory, *args, **kwargs):
    self_name = 'dataset_imbalance'
    logger = new_logger(self_name, directory)
    logger.info('Entering {0}'.format(self_name))
    list_dict_datasets = [{
        'name': 'abalone19',
        'id': 41357
    }, {
        'name': 'abalone',
        'id': 1557
    }, {
        'name': 'mnist_784',
        'id': 554
    }, {
        'name': 'iris',
        'id': 61
    }]
    for dict_dataset in list_dict_datasets:
        filepath = os.path.join(directory,
                                '{0}.npz'.format(dict_dataset['name']))
        if os.path.isfile(filepath):
            logger.info('Loading {0}'.format(filepath))
            npzfile = np.load(filepath, allow_pickle=True)
            X, y = npzfile['X'], npzfile['y']
        else:
            logger.info('Fetching {0}'.format(dict_dataset['name']))
            try:
                frame = fetch_openml(data_id=dict_dataset['id'],
                                     as_frame=True)
                X, y = frame['data'], frame['target']
                np.savez(filepath, X=X, y=y)
            except Exception as e:
                logger.warning(
                    'Failed to load and save {0}, due to error {1}'.format(
                        dict_dataset['name'], e))
                continue
        label_encoder = LabelEncoder().fit(y)
        labels, label_frequency = np.unique(label_encoder.transform(y),
                                            return_counts=True)
        ir = np.min(label_frequency) / np.max(label_frequency)
        entropy = scipy.stats.entropy(label_frequency, base=2)
        max_possible_entropy = scipy.stats.entropy(
            np.ones(label_frequency.shape), base=2)
        dict_dataset.update({
            'filepath': filepath,
            'labels': label_encoder.classes_,
            'labels_nbr': labels,
            'label_frequency': label_frequency,
            'imbalance_ratio': ir,
            'entropy': entropy,
            'max_possible_entropy': max_possible_entropy,
            'entropy_ratio': entropy / max_possible_entropy,
            'features': labels.size,  # note: number of distinct classes
        })
    filepath = os.path.join(directory, '{0}.csv'.format(self_name))
    with open(filepath, 'w', newline='') as f:
        dict_writer = csv.DictWriter(f, list_dict_datasets[0].keys())
        dict_writer.writeheader()
        dict_writer.writerows(list_dict_datasets)
    return
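# Small, self-contained sketch (toy numbers, not part of the original code)
# of the two balance measures computed in dataset_imbalance: the imbalance
# ratio and the entropy ratio of the class distribution.
def _imbalance_measures_example():
    import numpy as np
    import scipy.stats

    # toy class frequencies: 90 samples of class 0, 10 samples of class 1
    label_frequency = np.array([90, 10])
    # imbalance ratio: minority count over majority count (1.0 = balanced)
    ir = np.min(label_frequency) / np.max(label_frequency)
    # entropy of the empirical class distribution in bits (~0.47 here)
    entropy = scipy.stats.entropy(label_frequency, base=2)
    # entropy of a uniform distribution over the same classes (1 bit here)
    max_possible_entropy = scipy.stats.entropy(np.ones(label_frequency.shape),
                                               base=2)
    # entropy ratio: 1.0 for a balanced dataset, towards 0 for extreme
    # imbalance
    return ir, entropy / max_possible_entropy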
def silhouette_subset(directory, *args, **kwargs):
    logger = new_logger('plot_silhouette_subset', directory)
    logger.info('entering')
    X, y = get_mnist(directory)
    X /= 255.
    # preprocessing
    pca = PCA(n_components=50, whiten=False, random_state=42)
    X_pca = pca.fit_transform(X)
    # define subset sizes
    subset_sizes = [250, 500, 1000, 2000, 4000, 8000, 16000, 32000, 60000]
    # number of centroids
    k_list = [20]
    dict_results = {
        'subset_size': [],
        'k': [],
        'n_init': [],
        'silhouette_raninit': [],
        'silhouette_preinit': [],
        'fittime_raninit': [],
        'fittime_preinit': [],
        'scoretime_raninit': [],
        'scoretime_preinit': []
    }
    for k in k_list:
        # preinit: initial training set
        X_train, X_test, y_train, y_test = train_test_split(
            X_pca, y, random_state=42, train_size=subset_sizes[0],
            shuffle=True, stratify=y)
        clusterer_init = KMeans(n_clusters=k, random_state=42,
                                init='k-means++', n_init=10).fit(X_train)
        # random inits
        clusterer = KMeans(n_clusters=k, n_init=10, random_state=42)
        for subset_size in subset_sizes:
            # split on subset size
            dict_results['subset_size'].append(subset_size)
            X_train, X_test, y_train, y_test = train_test_split(
                X_pca, y, random_state=42, train_size=subset_size,
                shuffle=True, stratify=y)
            # train preinit
            t = time.time()
            clusterer_init = KMeans(n_clusters=k, random_state=42, n_init=1,
                                    init=clusterer_init.cluster_centers_)
            clusterer_init.fit_predict(X_train)
            dict_results['fittime_preinit'].append(time.time() - t)
            # score preinit
            t = time.time()
            dict_results['silhouette_preinit'].append(
                silhouette_score(X_train, clusterer_init.predict(X_train),
                                 metric='euclidean', random_state=42))
            dict_results['scoretime_preinit'].append(time.time() - t)
            # train randinit
            t = time.time()
            clusterer.fit(X_train)
            dict_results['fittime_raninit'].append(time.time() - t)
            # score raninit
            t = time.time()
            dict_results['silhouette_raninit'].append(
                silhouette_score(X_train, clusterer.predict(X_train),
                                 metric='euclidean', random_state=42))
            dict_results['scoretime_raninit'].append(time.time() - t)
            # store results
            dict_results['k'].append(k)
            dict_results['n_init'].append(clusterer.n_init)
            logger.info('silhouette (preinit) at subset size {1}: {0}'.format(
                dict_results['silhouette_preinit'][-1],
                dict_results['subset_size'][-1]))
    # save results to csv
    with open(os.path.join(directory, 'silhouette_kmeans_subset_size.csv'),
              'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for row in list(map(list, zip(*dict_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
    return
def elm_hyperparameters(directory):
    self_name = 'elm_hyperparameters'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))
    X = X / 255.
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)
    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y_encoded, train_size=train_size, random_state=42, shuffle=True)
    X_train, _, y_train, _ = (X[:train_size, :], X[train_size:, :],
                              y_encoded[:train_size], y_encoded[train_size:])

    # stage 1: input and bias scaling
    param_grid = {
        'hidden_layer_size': [2000],
        'input_scaling': np.logspace(start=-2, stop=2, base=10, num=7),
        'bias_scaling': np.logspace(start=-2, stop=2, base=10, num=7),
        'input_activation': ['tanh'],
        'alpha': [1e-5],
        'random_state': [42]
    }
    estimator = ELMClassifier(regressor=Ridge())
    cv = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1,
                      scoring='accuracy')
    cv.fit(X_train, y_train)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))
    cv_results = cv.cv_results_
    del cv_results['params']
    with open(os.path.join(directory, '{0}_scaling.csv'.format(self_name)),
              'w') as f:
        f.write(','.join(cv_results.keys()) + '\n')
        for row in list(map(list, zip(*cv_results.values()))):
            f.write(','.join(map(str, row)) + '\n')

    # stage 2: hidden layer size and activation
    param_grid = {
        'hidden_layer_size': [500, 1000, 2000, 4000],
        'input_scaling': [cv.best_params_['input_scaling']],
        'bias_scaling': [cv.best_params_['bias_scaling']],
        'input_activation': ['tanh', 'relu', 'bounded_relu', 'logistic',
                             'identity'],
        'alpha': [1e-5],
        'random_state': [42]
    }
    cv = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1,
                      scoring='accuracy')
    cv.fit(X_train, y_train)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))
    cv_results = cv.cv_results_
    del cv_results['params']
    with open(os.path.join(directory, '{0}_size.csv'.format(self_name)),
              'w') as f:
        f.write(','.join(cv_results.keys()) + '\n')
        for row in list(map(list, zip(*cv_results.values()))):
            f.write(','.join(map(str, row)) + '\n')

    # stage 3: regularization strength
    param_grid = {
        'hidden_layer_size': [cv.best_params_['hidden_layer_size']],
        'input_scaling': [cv.best_params_['input_scaling']],
        'bias_scaling': [cv.best_params_['bias_scaling']],
        'input_activation': [cv.best_params_['input_activation']],
        'alpha': [.00001, .001, .1],
        'random_state': [42]
    }
    cv = GridSearchCV(estimator, param_grid, cv=5, n_jobs=1,
                      scoring='accuracy')
    cv.fit(X_train, y_train)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))
    cv_results = cv.cv_results_
    del cv_results['params']
    with open(os.path.join(directory, '{0}_alpha.csv'.format(self_name)),
              'w') as f:
        f.write(','.join(cv_results.keys()) + '\n')
        for row in list(map(list, zip(*cv_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
def plot_historgram(directory, *args, **kwargs):
    logger = new_logger('plot_historgram', directory)
    logger.info('entering')
    X, y = get_mnist(directory)
    fig, axs = plt.subplots(1, 2, figsize=(5, 2),
                            gridspec_kw={'width_ratios': [1, 1.7]})
    example = np.zeros((28, 28, 3))
    example[..., 0] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # red
    example[..., 1] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # green
    example[..., 2] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # blue
    idx_fringe = (25, 17)
    idx_center = (13, 12)
    example[idx_center[0], idx_center[1], :] = tud_colors['lightblue'][:-1]
    example[idx_fringe[0], idx_fringe[1], :] = tud_colors['orange'][:-1]
    bins = np.array(range(0, 287, 32)).astype(int)
    hist_fringe, bin_edges = np.histogram(
        X[:, idx_fringe[0] * 28 + idx_fringe[1]], bins=bins)
    hist_center, bin_edges = np.histogram(
        X[:, idx_center[0] * 28 + idx_center[1]], bins=bins)
    logger.info(
        'validation sum hist_fringe: {0}, sum hist_center: {1}'.format(
            np.sum(hist_fringe / 1000), np.sum(hist_center / 1000)))
    axs[0].imshow(example, interpolation='none')
    axs[0].set_xticks([0, 27])
    axs[0].set_xticklabels([0, 27])
    axs[0].set_yticks([0, 27])
    axs[0].set_yticklabels([0, 27])
    axs[1].bar(bins[1:] - 32, height=hist_fringe / 1000, width=16,
               color=tud_colors['orange'], label='fringe', align='edge')
    axs[1].bar(bins[1:] - 16, height=hist_center / 1000, width=16,
               color=tud_colors['lightblue'], label='center', align='edge')
    axs[1].tick_params(axis='x', labelrotation=90)
    # axs[1].hist([], bins=range(0, 255, 32), color=[tud_colors['orange'],
    #             tud_colors['lightblue']], align='left')
    axs[1].set_xticks(bins)
    # axs[1].legend(bbox_to_anchor=(0, 1, 1, 0), loc="lower left",
    #               mode="expand", ncol=2)
    axs[1].legend(bbox_to_anchor=(1.0, .5), loc="center left")
    # fig.suptitle('Feature distribution in MNIST picture')
    axs[1].set_xlabel('value bins')
    axs[1].set_ylabel('probability')
    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-pixel-histogram.pdf'))
    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             'mnist-pixel-histogram.pgf'), format='pgf')
    # plt.show()
    return
def picture_gradient(directory):
    self_name = 'picture_gradient'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)
    # scale X so X in [0, 1]
    X /= 255.
    # reshape X
    X_images = X.reshape((X.shape[0], 28, 28))
    list_kernels = [{
        'name': 'laplace',
        'kernel': np.array([[-1., -1., -1.], [-1., 8, -1.], [-1., -1., -1.]])
    }, {
        'name': 'mexicanhat',
        'kernel': np.array([[0., 0., -1., 0., 0.], [0., -1., -2., -1., 0.],
                            [-1., -2., 16, -2., -1.], [0., -1., -2., -1., 0.],
                            [0., 0., -1., 0., 0.]])
    }, {
        'name': 'v_prewitt',
        'kernel': np.array([[-1., -1., -1.], [0., 0., 0.], [1., 1., 1.]])
    }, {
        'name': 'h_prewitt',
        'kernel': np.array([[-1., -1., -1.], [0., 0., 0.], [1., 1., 1.]]).T
    }, {
        'name': 'v_sobel',
        'kernel': np.array([[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]])
    }, {
        'name': 'h_sobel',
        'kernel': np.array([[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]]).T
    }]
    example_image_idx = 5
    fig, axs = plt.subplots(1, 4, figsize=(6, 2))
    axs[0].imshow(X_images[example_image_idx], cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[0].set_title('no filter')
    axs[1].imshow(convolve2d(X_images[example_image_idx],
                             list_kernels[0]['kernel'], mode='same'),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[1].set_title('laplace')
    axs[2].imshow(convolve2d(X_images[example_image_idx],
                             list_kernels[2]['kernel'], mode='same'),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[2].set_title('vertical\nprewitt')
    axs[3].imshow(convolve2d(X_images[example_image_idx],
                             list_kernels[5]['kernel'], mode='same'),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[3].set_title('horizontal\nsobel')
    for ax in axs:
        ax.set_xticks([0, 27])
        ax.set_xticklabels([0, 27])
        ax.set_yticks([0, 27])
        ax.set_yticklabels([0, 27])
    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-image-filters.pdf'),
                format='pdf')
    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             'mnist-image-filters.pgf'), format='pgf')
def test_new_logger() -> None:
    directory = os.getcwd()
    logger = new_logger(name='test_logger', directory=directory)
    logger.info('Test')
    assert os.path.isfile(os.path.join(directory, 'test_logger.log'))
def silhouette_n_clusters(directory, *args, **kwargs):
    logger = new_logger('plot_silhouette_n_clusters', directory)
    logger.info('entering')
    X, y = get_mnist(directory)
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)
    scaler = StandardScaler().fit(X)
    X /= 255.
    pca = PCA(n_components=50, whiten=False, random_state=42).fit(X)
    min_var = 3088.6875
    # reduce train size
    # X = X[:10000, ...]
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded,
                                                        train_size=10000,
                                                        random_state=42)
    # variance threshold
    X_var_threshold = X_train[..., scaler.var_ > min_var]
    # pca
    X_pca = pca.transform(X_train)
    # n_clusters
    k = [
        5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 30, 40,
        50, 60, 70, 80, 90, 100, 200, 500, 1000, 2000, 4000
    ]
    # n_init
    n_init = 10
    dict_results = {
        'n_clusters': [],
        'n_init': [],
        'variance_threshold': [],
        'pca_n_components': [],
        'pca_explained_variance': [],
        'pca_explained_variance_ratio': [],
        'silhouette_original': [],
        'silhouette_variance_threshold': [],
        'silhouette_pca': [],
        'fittime_original': [],
        'fittime_variance_threshold': [],
        'fittime_pca': [],
        'inertia_original': [],
        'inertia_variance_threshold': [],
        'inertia_pca': [],
        'n_iter_original': [],
        'n_iter_variance_threshold': [],
        'n_iter_pca': []
    }
    for n_clusters in k:
        dict_results['n_clusters'].append(n_clusters)
        dict_results['n_init'].append(n_init)
        dict_results['variance_threshold'].append(min_var)
        dict_results['pca_n_components'].append(pca.n_components_)
        dict_results['pca_explained_variance'].append(
            np.sum(pca.explained_variance_))
        dict_results['pca_explained_variance_ratio'].append(
            np.sum(pca.explained_variance_ratio_))
        clusterer = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++',
                                    n_init=n_init, random_state=42)
        # original
        t = time.time()
        clusterer.fit(X_train)
        dict_results['fittime_original'].append(time.time() - t)
        dict_results['inertia_original'].append(clusterer.inertia_)
        dict_results['n_iter_original'].append(clusterer.n_iter_)
        dict_results['silhouette_original'].append(
            silhouette_score(X_train, clusterer.predict(X_train),
                             metric='euclidean', random_state=42))
        np.save('./cluster_critical.npy', clusterer.cluster_centers_)
        # var threshold
        t = time.time()
        clusterer.fit(X_var_threshold)
        dict_results['fittime_variance_threshold'].append(time.time() - t)
        dict_results['inertia_variance_threshold'].append(clusterer.inertia_)
        dict_results['n_iter_variance_threshold'].append(clusterer.n_iter_)
        dict_results['silhouette_variance_threshold'].append(
            silhouette_score(X_train, clusterer.predict(X_var_threshold),
                             metric='euclidean', random_state=42))
        # pca
        t = time.time()
        clusterer.fit(X_pca)
        dict_results['fittime_pca'].append(time.time() - t)
        dict_results['inertia_pca'].append(clusterer.inertia_)
        dict_results['n_iter_pca'].append(clusterer.n_iter_)
        dict_results['silhouette_pca'].append(
            silhouette_score(X_train, clusterer.predict(X_pca),
                             metric='euclidean', random_state=42))
        logger.info('n_clusters = {0}, pca kmeans score: {1}'.format(
            n_clusters, dict_results['silhouette_pca'][-1]))
        logger.info('n_clusters = {0}'.format(n_clusters))
    # save results to csv
    with open(os.path.join(directory, 'silhouette_n_clusters.csv'),
              'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for row in list(map(list, zip(*dict_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
    return
def elm_coates_stacked(directory):
    self_name = 'elm_coates_stacked'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)
    # scale X so X in [0, 1]
    X /= 255.
    # setup parameter grid
    param_grid = {
        'chunk_size': [10000],
        'input_scaling': np.logspace(start=-3, stop=1, base=10, num=3),
        'bias_scaling': [0.],  # np.logspace(start=-3, stop=1, base=10, num=6),
        'input_activation': ['relu'],
        'alpha': [1e-5],
        'random_state': [42]
    }
    # read input matrices from files
    list_filepaths = []
    predefined_input_weights = np.empty((784, 0))
    for filepath in glob.glob(os.path.join(directory, '*kmeans1*matrix.npy')):
        logger.info('matrix file found: {0}'.format(filepath))
        list_filepaths.append(filepath)
        predefined_input_weights = np.append(predefined_input_weights,
                                             np.load(filepath), axis=1)
    # setup estimator
    estimator = ELMClassifier(
        PredefinedWeightsInputToNode(
            predefined_input_weights=predefined_input_weights),
        IncrementalRegression())
    logger.info('Estimator params: {0}'.format(estimator.get_params().keys()))
    # return
    # setup grid search
    cv = GridSearchCV(estimator=estimator,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=1,
                      verbose=1,
                      cv=[(np.arange(0, train_size),
                           np.arange(train_size, 70000))])
    # run!
    cv.fit(X, y_encoded)
    cv_best_params = cv.best_params_
    del cv_best_params['input_to_nodes__predefined_input_weights']
    # refine best params
    logger.info('best parameters: {0} (score: {1})'.format(
        cv_best_params, cv.best_score_))
    # refine results
    cv_results = cv.cv_results_
    del cv_results['params']
    del cv_results['param_input_to_nodes__predefined_input_weights']
    # save results
    try:
        with open(os.path.join(directory, '{0}.csv'.format(self_name)),
                  'w') as f:
            f.write(','.join(cv_results.keys()) + '\n')
            for row in list(map(list, zip(*cv_results.values()))):
                f.write(','.join(map(str, row)) + '\n')
    except PermissionError as e:
        print('Missing privileges: {0}'.format(e))
    if not list_filepaths:
        logger.warning('no input weights matrices found')
    return
def silhouette_features(directory, *args, **kwargs):
    logger = new_logger('plot_silhouette_features', directory)
    logger.info('entering')
    X, y = get_mnist(directory)
    X /= 255.
    X = X[:10000, ...]
    scaler = StandardScaler().fit(X)
    pca = PCA(whiten=False, random_state=42).fit(X)
    X_pca = pca.transform(X)
    # sort scaler variances (descending)
    variance_indices = np.argsort(scaler.var_)[::-1]
    n_features_list = [
        1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70,
        80, 90, 100, 200, 300, 400, 500, 600, 700, 784
    ]
    rs = np.random.RandomState(42)
    k = 20
    dict_results = {
        'nfeatures': [],
        'fittime_random': [],
        'fittime_maxvar': [],
        'fittime_pca': [],
        'silhouette_random': [],
        'silhouette_maxvar': [],
        'silhouette_pca': [],
        'explainvar_random': [],
        'explainvar_maxvar': [],
        'explainvar_pca': [],
        'explvarrat_random': [],
        'explvarrat_maxvar': [],
        'explvarrat_pca': [],
        'n_clusters': [],
    }
    for n_features in n_features_list:
        clusterer = KMeans(n_clusters=k, random_state=42)
        dict_results['nfeatures'].append(n_features)
        dict_results['n_clusters'].append(clusterer.n_clusters)
        # random feature subset
        indices = rs.choice(X.shape[1], size=n_features)
        t = time.time()
        pred = clusterer.fit_predict(X[:, indices])
        dict_results['fittime_random'].append(time.time() - t)
        dict_results['silhouette_random'].append(
            silhouette_score(X, pred, metric='euclidean', random_state=42))
        dict_results['explainvar_random'].append(np.sum(scaler.var_[indices]))
        dict_results['explvarrat_random'].append(
            np.sum(scaler.var_[indices]) / np.sum(scaler.var_))
        # maximum-variance feature subset
        t = time.time()
        indices = variance_indices[:n_features]
        pred = clusterer.fit_predict(X[:, indices])
        dict_results['fittime_maxvar'].append(time.time() - t)
        dict_results['silhouette_maxvar'].append(
            silhouette_score(X, pred, metric='euclidean', random_state=42))
        dict_results['explainvar_maxvar'].append(np.sum(scaler.var_[indices]))
        dict_results['explvarrat_maxvar'].append(
            np.sum(scaler.var_[indices]) / np.sum(scaler.var_))
        # first principal components
        t = time.time()
        pred = clusterer.fit_predict(X_pca[:, :n_features])
        dict_results['fittime_pca'].append(time.time() - t)
        dict_results['silhouette_pca'].append(
            silhouette_score(X, pred, metric='euclidean', random_state=42))
        dict_results['explainvar_pca'].append(
            np.sum(pca.explained_variance_[:n_features]))
        dict_results['explvarrat_pca'].append(
            np.sum(pca.explained_variance_ratio_[:n_features]))
        logger.info('pca silhouette at n_features={1:.0f}: {0}'.format(
            dict_results['silhouette_pca'][-1], n_features))
    # save results to csv
    with open(
            os.path.join(directory,
                         'silhouette_kmeans{0:.0f}_features.csv'.format(k)),
            'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for row in list(map(list, zip(*dict_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
    return
def elm_hidden_layer_size(directory):
    self_name = 'elm_hidden_layer_size'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))
    # encode y
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)
    # scale X
    X /= 255.
    # split train test
    X_train, X_test, y_train, y_test = (X[:train_size, :], X[train_size:, :],
                                        y_encoded[:train_size],
                                        y_encoded[train_size:])
    # fan-out from paper
    fan_out = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]
    # prepare parameter grids
    param_grid_basic = {
        'hidden_layer_size': 0,
        'input_scaling': 1.,
        'bias_scaling': 0.,
        'activation': 'relu',
        'chunk_size': 1000,
        'alpha': 1e-5,
        'random_state': 42
    }
    param_grid_pca = {
        'hidden_layer_size': 0,
        'input_scaling': 1.,
        'bias_scaling': 0.,
        'activation': 'relu',
        'chunk_size': 1000,
        'alpha': 1e-5,
        'random_state': 42
    }
    # setup estimator
    estimator = ELMClassifier()

    # basic
    try:
        # initialize filepath
        csv_filepath = os.path.join(directory,
                                    '{0}_basic.csv'.format(self_name))
        # initialize param dict
        param_dict_job = estimator.get_params().copy()
        param_dict_job.update(param_grid_basic)
        # initialize results dict
        results_dict_job = param_dict_job.copy()
        # add dummy results
        results_dict_job.update({'time_fit': 0, 'time_pred': 0, 'score': 0})
        # write header
        with open(csv_filepath, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=results_dict_job.keys())
            writer.writeheader()
        for hls in 784 * np.array(fan_out):
            param_dict_job.update({'hidden_layer_size': hls})
            estimator.set_params(**param_dict_job)
            # run!
            time_start = time.time()
            estimator.fit(X_train, y_train)
            time_fit = time.time()
            y_pred = estimator.predict(X_test)
            time_pred = time.time()
            # run end!
            results_dict_job.update(estimator.get_params())
            results_dict_job.update({
                'time_fit': time_fit - time_start,
                'time_pred': time_pred - time_fit,
                'score': accuracy_score(y_test, y_pred)
            })
            logger.info('hidden_layer_size: {0}, score: {1}'.format(
                hls, results_dict_job['score']))
            with open(csv_filepath, 'a') as f:
                writer = csv.DictWriter(f,
                                        fieldnames=results_dict_job.keys())
                writer.writerow(results_dict_job)
            del estimator.input_to_node._hidden_layer_state
            with open(
                    os.path.join(directory,
                                 'elmc_hls{0}_basic.pickle'.format(hls)),
                    'wb') as f:
                pickle.dump(estimator, f)
    except MemoryError as e:
        logger.error('Memory error: {0}'.format(e))
        pass
    except PermissionError as e:
        logger.error('Missing privileges: {0}'.format(e))
        pass

    # preprocessing pca
    try:
        # initialize filepath
        csv_filepath = os.path.join(directory,
                                    '{0}_pca.csv'.format(self_name))
        # preprocessing
        pca50 = PCA(n_components=50).fit(X_train)
        X_train_pca50, X_test_pca50 = (pca50.transform(X_train),
                                       pca50.transform(X_test))
        pca100 = PCA(n_components=100).fit(X_train)
        X_train_pca100, X_test_pca100 = (pca100.transform(X_train),
                                         pca100.transform(X_test))
        list_dict_pca = [{
            'n_components': 50,
            'X_train': X_train_pca50,
            'X_test': X_test_pca50
        }, {
            'n_components': 100,
            'X_train': X_train_pca100,
            'X_test': X_test_pca100
        }]
        logger.info('Preprocessing successful!')
        # initialize param dict
        param_dict_job = estimator.get_params().copy()
        param_dict_job.update(param_grid_pca)
        # initialize results dict
        results_dict_job = param_dict_job.copy()
        # add dummy results
        results_dict_job.update({
            'time_fit': 0,
            'time_pred': 0,
            'score': 0,
            'pca_n_components': 0
        })
        # write header
        with open(csv_filepath, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=results_dict_job.keys())
            writer.writeheader()
        for dict_pca in list_dict_pca:
            results_dict_job.update(
                {'pca_n_components': dict_pca['n_components']})
            for hls in np.concatenate(
                    (100 * np.array(fan_out), 784 * np.array(fan_out)),
                    axis=0):
                param_dict_job.update({'hidden_layer_size': hls})
                estimator.set_params(**param_dict_job)
                # run!
                time_start = time.time()
                estimator.fit(dict_pca['X_train'], y_train)
                time_fit = time.time()
                y_pred = estimator.predict(dict_pca['X_test'])
                time_pred = time.time()
                # run end!
                results_dict_job.update(estimator.get_params())
                results_dict_job.update({
                    'time_fit': time_fit - time_start,
                    'time_pred': time_pred - time_fit,
                    'score': accuracy_score(y_test, y_pred)
                })
                logger.info(
                    'n_components: {2}, hidden_layer_size: {0}, score:'
                    ' {1}'.format(hls, results_dict_job['score'],
                                  results_dict_job['pca_n_components']))
                with open(csv_filepath, 'a') as f:
                    writer = csv.DictWriter(
                        f, fieldnames=results_dict_job.keys())
                    writer.writerow(results_dict_job)
                with open(
                        os.path.join(
                            directory, 'elmc_hls{0}_pca{1}.pickle'.format(
                                hls, results_dict_job['pca_n_components'])),
                        'wb') as f:
                    pickle.dump(estimator, f)
    except MemoryError as e:
        logger.error('Memory error: {0}'.format(e))
        pass
    except PermissionError as e:
        logger.error('Missing privileges: {0}'.format(e))
        pass
def elm_preprocessed(directory):
    self_name = 'elm_preprocessed'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)
    # preprocessing
    X /= 255.
    pca = PCA(n_components=50).fit(X)
    X_preprocessed = pca.transform(X)
    logger.info('{0} features remaining after preprocessing.'.format(
        X_preprocessed.shape[1]))
    # train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_preprocessed, y_encoded, train_size=train_size, random_state=42)
    # prepare parameter grid
    param_grid = [{
        'hidden_layer_size': [500, 2000],
        'input_scaling': np.logspace(start=-3, stop=1, base=10, num=6),
        'bias_scaling': np.logspace(start=-3, stop=1, base=10, num=6),
        'input_activation': ['relu'],
        'alpha': [1e-5],
        'random_state': [42]
    }, {
        'hidden_layer_size': [2000],
        'input_scaling': np.logspace(start=-3, stop=1, base=10, num=6),
        'bias_scaling': np.logspace(start=-3, stop=1, base=10, num=6),
        'input_activation': ['tanh'],
        'alpha': [1e-5],
        'random_state': [42]
    }]
    # setup estimator
    estimator = ELMClassifier(regressor=Ridge())
    # setup grid search
    cv = GridSearchCV(estimator=estimator,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=1,
                      verbose=2,
                      refit=False,
                      cv=StratifiedShuffleSplit(n_splits=1, test_size=1 / 7,
                                                random_state=42))
    # run!
    cv.fit(X_train, y_train)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))
    # refine results
    cv_results = cv.cv_results_
    del cv_results['params']
    # save results
    try:
        with open(os.path.join(directory, 'elm_preprocessed.csv'), 'w') as f:
            f.write(','.join(cv_results.keys()) + '\n')
            for row in list(map(list, zip(*cv_results.values()))):
                f.write(','.join(map(str, row)) + '\n')
    except PermissionError as e:
        print('Missing privileges: {0}'.format(e))
def train_kmeans(directory):
    self_name = 'train_kmeans'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))
    # scale X, so $X \in [0, 1]$
    X /= 255.
    list_n_components = [50]  # [50, 100]
    # [20, 50, 100, 200, 500, 1000, 2000, 4000, 8000, 16000]
    list_n_clusters = [200]
    for n_components in list_n_components:
        pca = PCA(n_components=n_components, random_state=42).fit(X)
        X_pca = pca.transform(X)
        logger.info('pca{0}: explained variance ratio = {1}'.format(
            n_components, np.sum(pca.explained_variance_ratio_)))
        for n_clusters in list_n_clusters:
            # minibatch kmeans
            kmeans_basename = 'minibatch-pca{0}+kmeans{1}'.format(
                n_components, n_clusters)
            # only if file does not exist yet
            if not os.path.isfile(
                    os.path.join(directory,
                                 '{0}_matrix.npy'.format(kmeans_basename))):
                clusterer = MiniBatchKMeans(n_clusters=n_clusters,
                                            init='k-means++',
                                            random_state=42,
                                            batch_size=5000,
                                            n_init=5).fit(X_pca)
                # back-project the cluster centers from PCA space into the
                # original 784-dimensional pixel space
                np.save(
                    os.path.join(directory,
                                 '{0}_matrix.npy'.format(kmeans_basename)),
                    np.dot(pca.components_.T, clusterer.cluster_centers_.T))
                # assemble pipeline
                p = make_pipeline(pca, clusterer)
                with open(
                        os.path.join(
                            directory,
                            '{0}_pipeline.pickle'.format(kmeans_basename)),
                        'wb') as f:
                    pickle.dump(p, f)
                logger.info(
                    'successfully trained MiniBatchKMeans '
                    'and saved to npy/pickle {0}'.format(kmeans_basename))
            # original kmeans
            kmeans_basename = 'original-pca{0}+kmeans{1}'.format(
                n_components, n_clusters)
            if n_clusters < 2000 and not os.path.isfile(
                    os.path.join(directory,
                                 '{0}_matrix.npy'.format(kmeans_basename))):
                clusterer = KMeans(n_clusters=n_clusters,
                                   init='k-means++',
                                   random_state=42,
                                   n_init=5).fit(X_pca)
                np.save(
                    os.path.join(directory,
                                 '{0}_matrix.npy'.format(kmeans_basename)),
                    np.dot(pca.components_.T, clusterer.cluster_centers_.T))
                # assemble pipeline
                p = make_pipeline(pca, clusterer)
                with open(
                        os.path.join(
                            directory,
                            '{0}_pipeline.pickle'.format(kmeans_basename)),
                        'wb') as f:
                    pickle.dump(p, f)
                logger.info('successfully trained KMeans and saved to '
                            'npy/pickle {0}'.format(kmeans_basename))
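# Shape sketch (illustrative only, with stand-in arrays) for the matrices
# saved by train_kmeans: k-means centroids found in PCA space are projected
# back to pixel space via pca.components_, yielding one 784-dimensional
# column per centroid that can later serve as predefined input weights
# (see elm_coates / elm_coates_stacked).
def _backprojection_shape_example(n_pixels=784, n_components=50,
                                  n_clusters=200):
    import numpy as np

    # stand-ins for pca.components_ (n_components x n_pixels) and
    # clusterer.cluster_centers_ (n_clusters x n_components)
    components = np.random.rand(n_components, n_pixels)
    cluster_centers = np.random.rand(n_clusters, n_components)
    # (n_pixels x n_components) @ (n_components x n_clusters)
    input_weights = np.dot(components.T, cluster_centers.T)
    assert input_weights.shape == (n_pixels, n_clusters)
    return input_weights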
def elm_pca(directory):
    self_name = 'elm_pca'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))
    # scale X
    X /= 255.
    # split train test
    X_train, X_test, y_train, y_test = train_test_split(X[:train_size],
                                                        y[:train_size],
                                                        train_size=50000,
                                                        random_state=42)
    # prepare parameter grids
    param_grid_basic = {
        'hidden_layer_size': 2000,
        'input_scaling': 1.,
        'bias_scaling': 0.,
        'input_activation': 'relu',
        'alpha': 1e-5,
        'random_state': 42
    }
    # setup estimator
    estimator = ELMClassifier(regressor=Ridge())
    # initialize filepath
    filepath = os.path.join(directory, '{0}_basic.csv'.format(self_name))
    # initialize param dict
    param_dict_job = estimator.get_params().copy()
    param_dict_job.update(param_grid_basic)
    # initialize results dict
    results_dict_job = param_dict_job.copy()
    # add dummy results
    results_dict_job.update({
        'time_fit': 0,
        'time_pred': 0,
        'score': 0,
        'pca_n_components': 0
    })
    # preprocessing pca
    try:
        # write header
        with open(filepath, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=results_dict_job.keys())
            writer.writeheader()
        for pca_n_components in [10, 20, 50, 100, 200, 500, 784]:
            results_dict_job.update({'pca_n_components': pca_n_components})
            estimator.set_params(**param_dict_job)
            # preprocessing
            pca = PCA(n_components=pca_n_components).fit(X_train)
            X_train_pca, X_test_pca = \
                pca.transform(X_train), pca.transform(X_test)
            # run!
            time_start = time.time()
            estimator.fit(X_train_pca, y_train)
            time_fit = time.time()
            y_pred = estimator.predict(X_test_pca)
            time_pred = time.time()
            # run end!
            results_dict_job.update({
                'time_fit': time_fit - time_start,
                'time_pred': time_pred - time_fit,
                'score': accuracy_score(y_test, y_pred)
            })
            logger.info('pca.n_components_: {0}, score: {1}'.format(
                pca_n_components, results_dict_job['score']))
            with open(filepath, 'a') as f:
                writer = csv.DictWriter(f,
                                        fieldnames=results_dict_job.keys())
                writer.writerow(results_dict_job)
    except MemoryError as e:
        logger.error('Memory error: {0}'.format(e))
    except PermissionError as e:
        logger.error('Missing privileges: {0}'.format(e))
    except Exception as e:
        logger.error('Unexpected exception: {0}'.format(e))