def run_workload_characterization(metric_data):
    # Performs workload characterization on the metric_data and returns
    # a set of pruned metrics.
    #
    # Parameters:
    #   metric_data is a dictionary of the form:
    #     - 'data': 2D numpy matrix of metric data (results x metrics)
    #     - 'rowlabels': a list of identifiers for the rows in the matrix
    #     - 'columnlabels': a list of the metric names corresponding to
    #                       the columns in the data matrix
    matrix = metric_data['data']
    columnlabels = metric_data['columnlabels']

    # Remove any constant columns
    nonconst_matrix = []
    nonconst_columnlabels = []
    for col, cl in zip(matrix.T, columnlabels):
        if np.any(col != col[0]):
            nonconst_matrix.append(col.reshape(-1, 1))
            nonconst_columnlabels.append(cl)
    assert len(nonconst_matrix) > 0, "Need more data to train the model"
    nonconst_matrix = np.hstack(nonconst_matrix)
    n_rows, n_cols = nonconst_matrix.shape

    # Bin each column (metric) in the matrix by its decile
    binner = Bin(bin_start=1, axis=0)
    binned_matrix = binner.fit_transform(nonconst_matrix)

    # Shuffle the matrix rows
    shuffle_indices = get_shuffle_indices(n_rows)
    shuffled_matrix = binned_matrix[shuffle_indices, :]

    # Fit the factor analysis model
    fa_model = FactorAnalysis()
    # For now we use 5 latent variables (N_COMPONENTS)
    fa_model.fit(shuffled_matrix, nonconst_columnlabels,
                 n_components=N_COMPONENTS)

    # Components: metrics x factors
    components = fa_model.components_.T.copy()

    # Run KMeans for # clusters k in range(1, num_nonduplicate_metrics - 1).
    # K should be much smaller than n_cols in detK; for now max_cluster <= 20
    kmeans_models = KMeansClusters()
    kmeans_models.fit(components, min_cluster=1,
                      max_cluster=min(n_cols - 1, 20),
                      sample_labels=nonconst_columnlabels,
                      estimator_params={'n_init': 50})

    # Compute the optimal # of clusters, k, using the gap statistic
    gapk = create_kselection_model("gap-statistic")
    gapk.fit(components, kmeans_models.cluster_map_)

    # The pruned metrics are the closest samples to each cluster center
    pruned_metrics = kmeans_models.cluster_map_[
        gapk.optimal_num_clusters_].get_closest_samples()

    # Return pruned metrics
    return pruned_metrics
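
# Illustrative usage sketch (not part of the pipeline): builds a synthetic
# metric_data dictionary in the format documented above and feeds it to
# run_workload_characterization(). The sizes, values, and metric names are
# made up for illustration only; real callers assemble this dictionary from
# collected DBMS metrics, and the helpers used above (Bin, get_shuffle_indices,
# FactorAnalysis, KMeansClusters, create_kselection_model) must be importable
# for this to run.
def _example_workload_characterization():
    import numpy as np

    n_results, n_metrics = 100, 8
    example_metric_data = {
        'data': np.random.rand(n_results, n_metrics),
        'rowlabels': ['result_{}'.format(i) for i in range(n_results)],
        'columnlabels': ['metric_{}'.format(j) for j in range(n_metrics)],
    }
    pruned = run_workload_characterization(example_metric_data)
    print "pruned metrics: {}".format(pruned)
    return pruned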
def run_PCA(X):
    """Execute Principal Component Analysis.

    Arg
        X : X data with column and row labels. (Matrix)
    Return
        components : Result of PCA in descending order of explained variance.
                     (Numpy array, [# of features, # of components])
        components_columnlabels : Labels for each component.
                                  (Numpy array, [# of features,])
    """
    #--------------
    # Execute PCA.
    #--------------
    pca = PCA()
    pca.fit(X.data)

    #------------------------------
    # Determine number of factors.
    #------------------------------
    # Only nonzero components should be considered.
    pca_mask = np.sum(pca.components_ != 0.0, axis=1) > 0.0

    # Select the number of components that explains
    # REQUIRED_VARIANCE_EXPLAINED percent of the variance.
    variances = pca.explained_variance_ratio_
    variances_explained_percent = np.array(
        [np.sum(variances[:i + 1]) * 100 for i in range(variances.shape[0])])
    component_cutoff = np.count_nonzero(
        variances_explained_percent < REQUIRED_VARIANCE_EXPLAINED) + 1
    component_cutoff = min(component_cutoff, 10)

    # Print variances.
    print "component cutoff: {}".format(component_cutoff)
    for i, var in enumerate(variances):
        print i, var, np.sum(variances[:i + 1]), variances_explained_percent[i]

    #----------------
    # Postprocessing
    #----------------
    # Standardization
    components = np.transpose(pca.components_[:component_cutoff]).copy()
    print "components shape: {}".format(components.shape)
    standardizer = StandardScaler()
    components = standardizer.fit_transform(components)

    # Shuffle the PCA component rows. (metrics x components)
    metric_shuffle_indices = get_shuffle_indices(components.shape[0])
    components = components[metric_shuffle_indices]

    # Make labels for each column.
    components_columnlabels = X.columnlabels[metric_shuffle_indices]

    return components, components_columnlabels
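
# Illustrative usage sketch for run_PCA(): the function expects an object with
# .data and .columnlabels attributes (the project's Matrix type). The
# namedtuple below is only a stand-in for that type, and the random data and
# labels are fabricated for illustration. Assumes PCA, StandardScaler,
# get_shuffle_indices, and REQUIRED_VARIANCE_EXPLAINED are defined or imported
# elsewhere in this module.
def _example_run_pca():
    from collections import namedtuple
    import numpy as np

    FakeMatrix = namedtuple('FakeMatrix', ['data', 'rowlabels', 'columnlabels'])
    X = FakeMatrix(
        data=np.random.rand(50, 12),
        rowlabels=np.arange(50),
        columnlabels=np.array(['metric_{}'.format(j) for j in range(12)]))
    components, labels = run_PCA(X)
    # components: (12, component_cutoff); labels: shuffled metric names
    return components, labels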
def run_factor_analysis(X):
    """Execute factor analysis.

    Arg
        X : X data with column and row labels. (Matrix)
    Return
        components : Result of factor analysis in descending order of variance.
                     (Numpy array, [# of features, # of components])
        components_columnlabels : Labels for each component.
                                  (Numpy array, [# of features,])
    """
    #-------------------------
    # Execute factor analysis
    #-------------------------
    fa = FactorAnalysis()
    # Feed X.data.T to reduce across the feature axis,
    # X.data to reduce across the sample axis.
    fa.fit(X.data)

    #-----------------------------
    # Determine number of factors
    #-----------------------------
    # Only nonzero components should be considered.
    fa_mask = np.sum(fa.components_ != 0.0, axis=1) > 0.0

    # Calculate each factor's variance (actually the sum of absolute values)
    # and the total variance.
    variances = np.sum(np.abs(fa.components_[fa_mask]), axis=1)
    total_variance = np.sum(variances).squeeze()
    print "total variance: {}".format(total_variance)

    # Select the number of factors that explains
    # REQUIRED_VARIANCE_EXPLAINED percent of the variance.
    var_exp = np.array([np.sum(variances[:i + 1]) / total_variance * 100
                        for i in range(variances.shape[0])])
    factor_cutoff = np.count_nonzero(var_exp < REQUIRED_VARIANCE_EXPLAINED) + 1
    factor_cutoff = min(factor_cutoff, 10)

    print "factor cutoff: {}".format(factor_cutoff)
    for i, var in enumerate(variances):
        print i, var, np.sum(variances[:i + 1]), np.sum(variances[:i + 1]) / total_variance

    #----------------
    # Postprocessing
    #----------------
    # Standardization
    components = np.transpose(fa.components_[:factor_cutoff]).copy()
    print "components shape: {}".format(components.shape)
    standardizer = StandardScaler()
    components = standardizer.fit_transform(components)

    # Shuffle the factor analysis component rows. (metrics x factors)
    metric_shuffle_indices = get_shuffle_indices(components.shape[0])
    components = components[metric_shuffle_indices]

    # Make labels for each column.
    components_columnlabels = X.columnlabels[metric_shuffle_indices]

    return (components, components_columnlabels)
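
# Illustrative usage sketch for run_factor_analysis(), mirroring the PCA
# example above: a namedtuple stands in for the project's Matrix type, and the
# random data and labels are fabricated. Assumes FactorAnalysis (sklearn-style,
# exposing components_), StandardScaler, get_shuffle_indices, and
# REQUIRED_VARIANCE_EXPLAINED are available in this module.
def _example_run_factor_analysis():
    from collections import namedtuple
    import numpy as np

    FakeMatrix = namedtuple('FakeMatrix', ['data', 'rowlabels', 'columnlabels'])
    X = FakeMatrix(
        data=np.random.rand(50, 12),
        rowlabels=np.arange(50),
        columnlabels=np.array(['metric_{}'.format(j) for j in range(12)]))
    components, labels = run_factor_analysis(X)
    # components: (12, factor_cutoff); labels: shuffled metric names
    return components, labels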