import matplotlib.pyplot as plt
from pandas.plotting import radviz
from yellowbrick.features import RadViz


def test_radviz(pandas=False, outpath=None):
    """
    Runs the radviz visualizer on the dataset.

    Parameters
    ----------
    pandas : bool
        Run the pandas version of the function
    outpath : path or None
        Save the figure to disk rather than show (if None)
    """
    data = load_data('occupancy')  # Load the data (load_data is a local fixture helper)
    features = ['temp', 'humid', 'light', 'co2', 'hratio']
    classes = ['unoccupied', 'occupied']

    # as_matrix() was removed in pandas 1.0; use to_numpy() instead
    X = data[features].to_numpy()
    y = data.occupied.to_numpy()

    if pandas:
        radviz(data[features + ['occupied']], 'occupied')
        if outpath:
            plt.savefig(outpath)
        else:
            plt.show()
    else:
        # Instantiate the visualizer
        visualizer = RadViz(classes=classes, features=features)
        visualizer.fit(X, y)              # Fit the data to the visualizer
        visualizer.transform(X)           # Transform the data
        visualizer.poof(outpath=outpath)  # Draw/show/poof the data
def RandianViz(self, X, y, number_of_features):
    if number_of_features is None:
        features = X.columns.values
    else:
        features = X.columns.values[:number_of_features]

    fig, ax = plt.subplots(1, figsize=(15, 12))
    radViz = RadViz(classes=['survived', 'not survived'], features=features)

    radViz.fit(X, y)
    radViz.transform(X)
    radViz.poof()
def feature_analysis(fname="feature_analysis.png"):
    """
    Create figures for feature analysis
    """
    # Create side-by-side axes grid
    _, axes = plt.subplots(ncols=2, figsize=(18, 6))

    # Draw RadViz on the left
    data = load_occupancy(split=False)
    oz = RadViz(ax=axes[0], classes=["unoccupied", "occupied"])
    oz.fit(data.X, data.y)
    oz.finalize()

    # Draw Rank2D on the right
    data = load_concrete(split=False)
    oz = Rank2D(ax=axes[1])
    oz.fit_transform(data.X, data.y)
    oz.finalize()

    # Save figure
    path = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(path)
def radviz(ax):
    from yellowbrick.features import RadViz

    # Specify the features of interest and the classes of the target
    features = ["temperature", "relative humidity", "light", "C02", "humidity"]
    target = "occupancy"
    classes = ['unoccupied', 'occupied']

    # Load the data
    X, y = load_data('occupancy', cols=features, target=target)

    # Instantiate and fit the visualizer
    visualizer = RadViz(ax=ax, classes=classes, features=features)
    visualizer.title = "RadViz of Features to Predict Room Occupancy"
    visualizer.fit(X, y)
    visualizer.transform(X)
    return visualizer
def plot_Radvis(objectives, ax, name):
    # RadViz requires a target; use a single dummy class so unlabeled
    # objective values can still be plotted
    class_dummy = np.zeros(len(objectives))
    visualizer = RadViz(classes=[name], ax=ax, alpha=.75)
    visualizer.fit(objectives, class_dummy)
    visualizer.show()
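# A minimal usage sketch for plot_Radvis above. The objectives DataFrame,
# its column names, and the axes setup are illustrative assumptions, not
# from the source.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# hypothetical multi-objective values
objectives_df = pd.DataFrame(
    np.random.rand(50, 3), columns=["cost", "latency", "error"])

fig, ax = plt.subplots(figsize=(8, 8))
plot_Radvis(objectives_df, ax, name="pareto front")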
visualizer = Rank2D()
visualizer.fit_transform(X)
visualizer.poof()

# %%
visualizer = FeatureCorrelation()
visualizer.fit(X, y)
visualizer.poof()

# %%
visualizer = FeatureCorrelation(method='mutual_info-classification')
visualizer.fit(X, y)
visualizer.poof()

# %%
visualizer = RadViz(classes=class_names)
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()

# %%
colors = np.array(['r' if yi else 'b' for yi in y])
visualizer = PCADecomposition(color=colors, proj_features=True)
visualizer.fit_transform(X, y)
visualizer.poof()

visualizer = PCADecomposition(scale=True, color=colors, proj_dim=3, proj_features=True)
visualizer.fit_transform(X, y)
visualizer.poof()
def radviz():
    X, y = load_occupancy()
    oz = RadViz(ax=newfig())
    oz.fit_transform(X, y)
    savefig(oz, "radviz")
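# A possible shape for the newfig/savefig helpers the snippet above assumes;
# the originals are not shown in the source, so this is purely illustrative.
import matplotlib.pyplot as plt

def newfig():
    # create a fresh axes for each visualizer
    _, ax = plt.subplots(figsize=(9, 6))
    return ax

def savefig(oz, name, ext=".png"):
    # render the finished visualizer to disk via yellowbrick's show()
    oz.show(outpath=name + ext)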
import pandas as pd

from yellowbrick.features import RadViz

# Load the classification data set
data = pd.read_csv("../../../examples/data/occupancy/occupancy.csv")

# Specify the features of interest and the classes of the target
features = ["temperature", "relative humidity", "light", "C02", "humidity"]
classes = ['unoccupied', 'occupied']

# Extract the instances and target
X = data[features]
y = data.occupancy

# Instantiate the visualizer
visualizer = RadViz(classes=classes, features=features)

visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof(outpath="images/radviz.png")
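# For comparison, a one-call sketch using Yellowbrick's radviz quick method,
# which wraps the fit/transform/poof sequence above. Assumes a Yellowbrick
# release that ships quick methods, and reuses the X, y, classes, and
# features defined above.
from yellowbrick.features import radviz

visualizer = radviz(X, y, classes=classes, features=features)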
features_ordered.append(a)
last_feature = a

print(features_ordered)
print(len(features))
features = [train_columns[i] for i in features_ordered]
print(features)
print(len(features))
XT = X[:, numpy.asarray(features_ordered)]
print(X.shape, XT.shape, Y.shape)

# The stray triple-quote below delimits a commented-out block in the source;
# its partner lies outside this excerpt.
"""

numpy.random.seed(1)
XT = numpy.arange(1000).reshape((-1, 2))
YT = numpy.random.randint(6, size=len(XT)) * 10
print(XT.shape, XT.dtype, YT.shape, YT.dtype)

visualizer = RadViz(classes=[0, 10, 20, 30, 40, 50])
visualizer.fit_transform(XT, YT)
visualizer.poof(outpath='viz_feature_radviz.pdf', bbox_inches='tight')
plt.close('all')

visualizer = ParallelCoordinates(classes=[0, 10, 20, 30, 40, 50],
                                 sample=0.05, shuffle=True, fast=True)
visualizer.fit_transform(XT, YT)
visualizer.poof(outpath='viz_feature_parallel_coords.pdf', bbox_inches='tight')
plt.close('all')

XT = X
YT = Y
def visualize_features(classes, problem_type, curdir, default_features, balance_data, test_size):

    # make features into label encoder here
    features, feature_labels, class_labels = get_features(
        classes, problem_type, default_features, balance_data)

    # now preprocess features for all the other plots
    os.chdir(curdir)

    le = preprocessing.LabelEncoder()
    le.fit(class_labels)
    tclass_labels = le.transform(class_labels)

    # process features to help with clustering
    se = preprocessing.StandardScaler()
    t_features = se.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(
        features, tclass_labels, test_size=test_size, random_state=42)

    # print(len(features))
    # print(len(feature_labels))
    # print(len(class_labels))
    # print(class_labels)

    # GET TRAINING DATA DURING MODELING PROCESS
    ##################################
    # get filename
    # csvfile=''
    # print(classes)
    # for i in range(len(classes)):
    #     csvfile=csvfile+classes[i]+'_'

    # get training and testing data for later
    # try:
    #     print('loading training files...')
    #     X_train=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'train.csv')
    #     y_train=X_train['class_']
    #     X_train.drop(['class_'], axis=1)
    #     X_test=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'test.csv')
    #     y_test=X_test['class_']
    #     X_test.drop(['class_'], axis=1)
    #     y_train=le.inverse_transform(y_train)
    #     y_test=le.inverse_transform(y_test)
    # except:
    #     print('error loading in training files, making new test data')

    # Visualize each class (quick plot)
    ##################################
    visualization_dir = 'visualization_session'
    try:
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)
    except:
        shutil.rmtree(visualization_dir)
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)

    objects = tuple(set(class_labels))
    y_pos = np.arange(len(objects))
    performance = list()
    for i in range(len(objects)):
        performance.append(class_labels.count(objects[i]))

    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.xticks(rotation=90)
    plt.title('Counts per class')
    plt.ylabel('Count')
    plt.xlabel('Class')
    plt.tight_layout()
    plt.savefig('classes.png')
    plt.close()

    # set current directory
    curdir = os.getcwd()

    ##################################
    # CLUSTERING!!!
    ##################################

    ##################################
    # Manifold type options
    ##################################
    '''
    "lle"       Locally Linear Embedding (LLE) uses many local linear
                decompositions to preserve globally non-linear structures.
    "ltsa"      LTSA LLE: local tangent space alignment is similar to LLE
                in that it uses locality to preserve neighborhood distances.
    "hessian"   Hessian LLE is an LLE regularization method that applies a
                hessian-based quadratic form at each neighborhood.
    "modified"  Modified LLE applies a regularization parameter to LLE.
    "isomap"    Isomap seeks a lower dimensional embedding that maintains
                geometric distances between each instance.
    "mds"       MDS: multi-dimensional scaling uses similarity to plot
                points that are near to each other close in the embedding.
    "spectral"  Spectral Embedding is a discrete approximation of the low
                dimensional manifold using a graph representation.
    "tsne"      (default) t-SNE: converts the similarity of points into
                probabilities, then uses those probabilities to create an
                embedding.
    '''

    os.mkdir('clustering')
    os.chdir('clustering')

    # tSNE
    plt.figure()
    viz = Manifold(manifold="tsne", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="tsne.png")
    plt.close()
    # os.system('open tsne.png')
    # viz.show()

    # PCA
    plt.figure()
    visualizer = PCADecomposition(scale=True, classes=set(classes))
    visualizer.fit_transform(np.array(features), tclass_labels)
    visualizer.poof(outpath="pca.png")
    plt.close()
    # os.system('open pca.png')

    # spectral embedding
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # lle embedding
    plt.figure()
    viz = Manifold(manifold="lle", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="lle.png")
    plt.close()

    # ltsa
    # plt.figure()
    # viz = Manifold(manifold="ltsa", classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="ltsa.png")
    # plt.close()

    # hessian
    # plt.figure()
    # viz = Manifold(manifold="hessian", method='dense', classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="hessian.png")
    # plt.close()

    # modified
    plt.figure()
    viz = Manifold(manifold="modified", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="modified.png")
    plt.close()

    # isomap
    plt.figure()
    viz = Manifold(manifold="isomap", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="isomap.png")
    plt.close()

    # mds
    plt.figure()
    viz = Manifold(manifold="mds", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="mds.png")
    plt.close()

    # UMAP embedding
    plt.figure()
    umap = UMAPVisualizer(metric='cosine', classes=set(classes),
                          title="UMAP embedding")
    umap.fit_transform(np.array(features), class_labels)
    umap.poof(outpath="umap.png")
    plt.close()

    # alternative UMAP
    # import umap.plot
    # plt.figure()
    # mapper = umap.UMAP().fit(np.array(features))
    # fig = umap.plot.points(mapper, labels=np.array(tclass_labels))
    # fig = fig.get_figure()
    # fig.tight_layout()
    # fig.savefig('umap2.png')
    # plt.close(fig)

    #################################
    # FEATURE RANKING!!
    #################################
    os.chdir(curdir)
    os.mkdir('feature_ranking')
    os.chdir('feature_ranking')

    # You can get the feature importance of each feature of your dataset
    # by using the feature importance property of the model.
    plt.figure(figsize=(12, 12))
    model = ExtraTreesClassifier()
    model.fit(np.array(features), tclass_labels)
    # print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_, index=feature_labels[0])
    feat_importances.nlargest(20).plot(kind='barh')
    plt.title('Feature importances (ExtraTrees) with %s features' % str(len(features[0])), size=16)
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    # os.system('open feature_importance.png')

    # get selected labels for top 20 features
    selectedlabels = list(dict(feat_importances.nlargest(20)))
    new_features, new_labels = restructure_features(selectedlabels, t_features, feature_labels[0])
    new_features_, new_labels_ = restructure_features(selectedlabels, features, feature_labels[0])

    # Shapiro rank algorithm (1D)
    plt.figure(figsize=(28, 12))
    visualizer = Rank1D(algorithm='shapiro', classes=set(classes), features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    # plt.tight_layout()
    visualizer.poof(outpath="shapiro.png")
    plt.title('Shapiro plot (top 20 features)', size=16)
    plt.close()
    # os.system('open shapiro.png')
    # visualizer.show()

    # pearson ranking algorithm (2D)
    plt.figure(figsize=(12, 12))
    visualizer = Rank2D(algorithm='pearson', classes=set(classes), features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    plt.tight_layout()
    visualizer.poof(outpath="pearson.png")
    plt.title('Pearson ranking plot (top 20 features)', size=16)
    plt.close()
    # os.system('open pearson.png')
    # visualizer.show()

    # feature importances with top 20 features for Lasso
    plt.figure(figsize=(12, 12))
    viz = FeatureImportances(Lasso(), labels=new_labels_)
    viz.fit(np.array(new_features_), tclass_labels)
    plt.tight_layout()
    viz.poof(outpath="lasso.png")
    plt.close()

    # correlation plots with feature removal if corr > 0.90
    # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf
    # now remove correlated features
    # --> p values
    # --> https://towardsdatascience.com/the-next-level-of-data-visualization-in-python-dd6e99039d5e
    #     https://github.com/WillKoehrsen/Data-Analysis/blob/master/plotly/Plotly%20Whirlwind%20Introduction.ipynb
    #     - plotly for correlation heatmap and scatterplot matrix
    # --> https://seaborn.pydata.org/tutorial/distributions.html
    data = new_features
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap with correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap.png')
    plt.close(fig)

    # drop one feature of each pair correlated above 0.90
    columns = np.full((corr.shape[0],), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= 0.9:
                if columns[j]:
                    columns[j] = False

    selected_columns = data.columns[columns]
    data = data[selected_columns]
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap without correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap_clean.png')
    plt.close(fig)

    # radviz
    # Instantiate the visualizer
    plt.figure(figsize=(12, 12))
    visualizer = RadViz(classes=classes, features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    visualizer.poof(outpath="radviz.png")
    visualizer.show()
    plt.close()

    # feature correlation plot
    plt.figure(figsize=(28, 12))
    visualizer = feature_correlation(np.array(new_features), tclass_labels, labels=new_labels)
    visualizer.poof(outpath="correlation.png")
    visualizer.show()
    plt.tight_layout()
    plt.close()

    os.mkdir('feature_plots')
    os.chdir('feature_plots')

    newdata = new_features_
    newdata['classes'] = class_labels

    for j in range(len(new_labels_)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels_[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels_[j]))
        plt.close(fig)

    os.mkdir('feature_plots_transformed')
    os.chdir('feature_plots_transformed')

    newdata = new_features
    newdata['classes'] = class_labels

    for j in range(len(new_labels)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels[j]))
        plt.close(fig)

    ##################################################
    # PRECISION-RECALL CURVES
    ##################################################
    os.chdir(curdir)
    os.mkdir('model_selection')
    os.chdir('model_selection')

    plt.figure()
    visualizer = precision_recall_curve(GaussianNB(), np.array(features), tclass_labels)
    visualizer.poof(outpath="precision-recall.png")
    plt.close()

    plt.figure()
    visualizer = roc_auc(LogisticRegression(), np.array(features), tclass_labels)
    visualizer.poof(outpath="roc_curve_train.png")
    plt.close()

    plt.figure()
    visualizer = discrimination_threshold(
        LogisticRegression(multi_class="auto", solver="liblinear"),
        np.array(features), tclass_labels)
    visualizer.poof(outpath="thresholds.png")
    plt.close()

    plt.figure()
    visualizer = residuals_plot(
        Ridge(), np.array(features), tclass_labels,
        train_color="maroon", test_color="gold")
    visualizer.poof(outpath="residuals.png")
    plt.close()

    plt.figure()
    visualizer = prediction_error(Lasso(), np.array(features), tclass_labels)
    visualizer.poof(outpath='prediction_error.png')
    plt.close()

    # outlier detection
    plt.figure()
    visualizer = cooks_distance(
        np.array(features), tclass_labels,
        draw_threshold=True, linefmt="C0-", markerfmt=",")
    visualizer.poof(outpath='outliers.png')
    plt.close()

    # cluster numbers
    plt.figure()
    visualizer = silhouette_visualizer(
        KMeans(len(set(tclass_labels)), random_state=42), np.array(features))
    visualizer.poof(outpath='silhouette.png')
    plt.close()

    # cluster distance
    plt.figure()
    visualizer = intercluster_distance(
        KMeans(len(set(tclass_labels)), random_state=777), np.array(features))
    visualizer.poof(outpath='cluster_distance.png')
    plt.close()

    # vary the percentile of selected features to see which percentile
    # is optimal (Anova + LogisticRegression)
    features = preprocessing.MinMaxScaler().fit_transform(features)
    clf = Pipeline([('anova', SelectPercentile(chi2)),
                    ('scaler', StandardScaler()),
                    ('logr', LogisticRegression())])

    score_means = list()
    score_stds = list()
    percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100)

    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, np.array(features), class_labels)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title('Performance of the LogisticRegression-Anova varying the percent of features selected')
    plt.xticks(np.linspace(0, 100, 11, endpoint=True))
    plt.xlabel('Percentile')
    plt.ylabel('Accuracy Score')
    plt.axis('tight')
    plt.savefig('logr_percentile_plot.png')
    plt.close()

    # get PCA
    pca = PCA(random_state=1)
    pca.fit(X_train)
    skplt.decomposition.plot_pca_component_variance(pca)
    plt.savefig('pca_explained_variance.png')
    plt.close()
    # estimators
    rf = RandomForestClassifier()
    skplt.estimators.plot_learning_curve(rf, X_train, y_train)
    plt.title('Learning Curve (Random Forest)')
    plt.savefig('learning_curve.png')
    plt.close()

    # elbow plot
    kmeans = KMeans(random_state=1)
    skplt.cluster.plot_elbow_curve(kmeans, X_train, cluster_ranges=range(1, 30),
                                   title='Elbow plot (KMeans clustering)')
    plt.savefig('elbow.png')
    plt.close()

    # KS statistic (only if 2 classes)
    lr = LogisticRegression()
    lr = lr.fit(X_train, y_train)
    y_probas = lr.predict_proba(X_test)
    skplt.metrics.plot_ks_statistic(y_test, y_probas)
    plt.savefig('ks.png')
    plt.close()

    # precision-recall
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_probas = nb.predict_proba(X_test)
    skplt.metrics.plot_precision_recall(y_test, y_probas)
    plt.tight_layout()
    plt.savefig('precision-recall.png')
    plt.close()

    # plot calibration curve
    rf = RandomForestClassifier()
    lr = LogisticRegression()
    nb = GaussianNB()
    svm = LinearSVC()
    dt = DecisionTreeClassifier(random_state=0)
    ab = AdaBoostClassifier(n_estimators=100)
    gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                    max_depth=1, random_state=0)
    knn = KNeighborsClassifier(n_neighbors=7)

    rf_probas = rf.fit(X_train, y_train).predict_proba(X_test)
    lr_probas = lr.fit(X_train, y_train).predict_proba(X_test)
    nb_probas = nb.fit(X_train, y_train).predict_proba(X_test)
    # svm_scores = svm.fit(X_train, y_train).predict_proba(X_test)
    dt_scores = dt.fit(X_train, y_train).predict_proba(X_test)
    ab_scores = ab.fit(X_train, y_train).predict_proba(X_test)
    gb_scores = gb.fit(X_train, y_train).predict_proba(X_test)
    knn_scores = knn.fit(X_train, y_train).predict_proba(X_test)

    probas_list = [
        rf_probas, lr_probas, nb_probas,
        # svm_scores,
        dt_scores, ab_scores, gb_scores, knn_scores
    ]
    clf_names = [
        'Random Forest', 'Logistic Regression', 'Gaussian NB',
        # 'SVM',
        'Decision Tree', 'Adaboost', 'Gradient Boost', 'KNN'
    ]

    skplt.metrics.plot_calibration_curve(y_test, probas_list, clf_names)
    plt.savefig('calibration.png')
    plt.tight_layout()
    plt.close()

    # pick classifier type by ROC (without optimization)
    probs = [
        rf_probas[:, 1], lr_probas[:, 1], nb_probas[:, 1],
        # svm_scores[:, 1],
        dt_scores[:, 1], ab_scores[:, 1], gb_scores[:, 1], knn_scores[:, 1]
    ]
    plot_roc_curve(y_test, probs, clf_names)

    # more elaborate ROC example with CV = 5 fold
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py

    os.chdir(curdir)

    return ''
df_processed, num_features, cat_features = EDA(
    df, labels, target_variable_name,
    data_summary_figsize=(6, 6), corr_matrix_figsize=(6, 6),
    corr_matrix_annot=True, pairplt=True)

# dividing the X and the y
X = df_processed.drop([target_variable_name], axis=1)
y = df_processed[target_variable_name]

# RadViz plot
from yellowbrick.features import RadViz

visualizer = RadViz(classes=labels, features=X.columns.tolist(), size=(800, 300))
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.show()

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
oz = Rank2D(features=cols)
oz.fit_transform(X, y)
oz.poof()

oz = Rank2D(features=cols, algorithm='covariance')
oz.fit_transform(X, y)
oz.poof()

g = sns.jointplot(x='review_count', y='rating', kind='hex', data=data)
h = sns.jointplot(x='price', y='rating', kind='hex', data=data)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

oz = RadViz(classes=label_encoder.classes_, features=cols)
oz.fit(X, y)
oz.poof()

oz = ParallelCoordinates(classes=label_encoder.classes_, features=cols)
oz.fit(X, y)
oz.poof()

oz = ParallelCoordinates(normalize='minmax', classes=label_encoder.classes_, features=cols)
oz.fit(X, y)
oz.poof()

df = pd.DataFrame(data)
numeric_features = [
    "price",
    "rating",
locationFileNameFC = os.path.join(
    '/home/ak/Documents/Research/Papers/figures',
    str(symbols[symbolIdx]) + '_idx_' + str(idx) + '_label_' + str(labelName) +
    '_date_' + str(dateIdx) + '_label_' + str(labelsIdx) + '_FeatureCorrelation_w_depn_var.png')

plt.xlabel('', fontsize=11)
plt.xticks(fontsize=14)
plt.yticks(fontsize=12)
visualizerFC.show(outpath=locationFileNameFC)
plt.show()

# Instantiate the visualizer
set_palette('yellowbrick')
plt.figure()
classes = np.array([0, 1.])
plt.xticks(fontsize=9)

visualizerRadViz = RadViz(classes=classes, features=features, title=' ')
visualizerRadViz.fit(X, y)     # Fit the data to the visualizer
visualizerRadViz.transform(X)  # Transform the data

locationFileNameRVZ = os.path.join(
    '/home/ak/Documents/Research/Papers/figures',
    str(symbols[symbolIdx]) + '_idx_' + str(idx) + '_label_' + str(labelsIdx) +
    '_date_' + str(dateIdx) + '_radviz.png')
visualizerRadViz.show(outpath=locationFileNameRVZ)
plt.show()

## MDS
# Instantiate the clustering model and visualizer
model = KMeans(6)
plt.figure()
plt.xlabel('features', fontsize=12)
plt.ylabel('features', fontsize=12)