def test_rank2d(seaborn=False, outpath=None):
    """
    Runs the Rank2D visualizer (covariance ranking) on the occupancy dataset.

    Parameters
    ----------
    seaborn : bool
        Run the seaborn version of the function. Not implemented yet:
        passing True raises NotImplementedError (after the data is loaded).

    outpath : path or None
        Save the figure to disk rather than show (if None)
    """
    data = load_data('occupancy')  # Load the data
    features = ['temp', 'humid', 'light', 'co2', 'hratio']

    # DataFrame.as_matrix() was removed from pandas; .values yields the same
    # ndarray on both old and current pandas releases.
    X = data[features].values
    y = data.occupied.values

    if seaborn:
        raise NotImplementedError("Not yet!")

    visualizer = Rank2D(features=features, algorithm='covariance')
    visualizer.fit(X, y)  # Fit the data to the visualizer
    visualizer.transform(X)  # Transform the data
    visualizer.poof(outpath=outpath)  # Draw/show/poof the data
def generate_rank_2d(self, X, algorithm='pearson', **kwargs):
    """
    Given the entire (train+test) input features, returns a plotly Heatmap
    figure showing the feature x feature correlation.

    :param X: the input features to the model
    :param algorithm: the algorithm to calculate the importance with
        (pearson, covariance, spearman, kendalltau)
    :param kwargs: extra keyword arguments forwarded unchanged to go.Heatmap
    :return: a go.Figure containing a single Heatmap trace whose x and y
        axes are both labelled with self.feature_names
    """
    visualizer = Rank2D(algorithm=algorithm)
    visualizer.fit_transform(X)

    # Pairwise ranking matrix exposed by the fitted visualizer.
    ranks = visualizer.ranks_

    heatmap = go.Heatmap(
        z=ranks,
        x=self.feature_names,
        y=self.feature_names,
        **kwargs
    )
    return go.Figure([heatmap])
def rank_2d(features, algorithm, X, y):
    """
    Fit a yellowbrick Rank2D visualizer on (X, y) and display it.

    ``features`` and ``algorithm`` are handed straight to the Rank2D
    constructor; nothing is returned.
    """
    from yellowbrick.features import Rank2D

    ranker = Rank2D(features=features, algorithm=algorithm)

    # Fit, transform, then draw/show the figure.
    ranker.fit(X, y)
    ranker.transform(X)
    ranker.poof()
def visualizeFeatureImportance(features, labels):
    """
    Show a Rank2D covariance plot of ``features``.

    The "appid" and "name" columns are dropped before ranking, and every
    entry of ``labels`` is mapped through convertLabelToNumber to build the
    target passed to ``fit``.
    """
    ranker = Rank2D(algorithm='covariance')

    ranked_columns = features.drop(["appid", "name"], axis=1)
    numeric_labels = [convertLabelToNumber(label) for label in labels]
    ranker.fit(ranked_columns, numeric_labels)

    # The original dropped the columns a second time for transform; an
    # independent drop of the same input is kept so the call is unchanged.
    ranker.transform(features.drop(["appid", "name"], axis=1))
    ranker.poof()
def explore_features(df):
    """
    Show a Rank2D Pearson plot for the columns of ``df``.

    The input frame is copied first, so ``df`` itself is not modified.
    Categorical columns are cast to ``str``, the copy is passed through
    ``autoclean`` and the result is fed to the visualizer.
    """
    df_copy = df.copy()

    # For some reason the visualizer doesn't accept categorical variables;
    # those have to be converted to strings.
    # Iterate over the column labels directly: DataFrame.iteritems() was
    # removed in pandas 2.0, and the column data it yielded was never used.
    for col in df_copy.columns:
        if df_copy[col].dtype.name == "category":
            df_copy[col] = df_copy[col].astype(str)

    numeric_df = autoclean(df_copy)

    visualizer = Rank2D(algorithm="pearson")
    visualizer.fit_transform(numeric_df)
    visualizer.poof()
def testFunc5(savepath='Results/bikeshare_Rank2D.png'):
    '''
    Bike-share dataset: Rank2D Pearson plot of the feature columns.

    Reads fixtures/bikeshare/bikeshare.csv, ranks the selected feature
    columns pairwise and hands ``savepath`` to ``poof`` as ``outpath``.
    '''
    data = pd.read_csv('fixtures/bikeshare/bikeshare.csv')
    X = data[[
        "season", "month", "hour", "holiday", "weekday", "workingday",
        "weather", "temp", "feelslike", "humidity", "windspeed"
    ]]
    # The "riders" target column was previously read into an unused local;
    # Rank2D is fitted on the features only, so it is no longer extracted.

    visualizer = Rank2D(algorithm="pearson")
    visualizer.fit_transform(X)
    visualizer.poof(outpath=savepath)
def Corr_vision(X):
    """
    Correlation visualization according to Pearson

    Parameters
    ----------
    X: matrix of features

    Returns
    -------
    None. The plot with the feature correlations is displayed via plt.show().
    """
    # Open a 20x20 figure before the visualizer is built; the returned
    # figure/axes handles are not needed afterwards.
    # NOTE(review): presumably Rank2D draws on the current axes created here,
    # since no ax is passed explicitly -- confirm.
    plt.subplots(figsize=(20, 20))

    pearson_viz = Rank2D(algorithm="pearson")
    pearson_viz.fit_transform(X)

    # pearson_viz.show('corr_matrix')  # alternative: output a png
    plt.show()
def rank2d(ax, algorithm='pearson'):
    """
    Build a Rank2D visualizer for the credit dataset on the given axes.

    :param ax: axes object forwarded to Rank2D (it was previously accepted
        but silently ignored)
    :param algorithm: ranking algorithm name passed to Rank2D; it is also
        title-cased into the plot title
    :return: the fitted and transformed Rank2D visualizer (not yet shown)
    """
    from yellowbrick.features import Rank2D

    # Specify the features of interest
    features = [
        'limit', 'sex', 'edu', 'married', 'age',
        'apr_delay', 'may_delay', 'jun_delay',
        'jul_delay', 'aug_delay', 'sep_delay',
        'apr_bill', 'may_bill', 'jun_bill',
        'jul_bill', 'aug_bill', 'sep_bill',
        'apr_pay', 'may_pay', 'jun_pay',
        'jul_pay', 'aug_pay', 'sep_pay',
    ]

    # Load the data
    X, y = load_data('credit', cols=features, target='default')

    # Instantiate and fit the visualizer, drawing on the caller's axes.
    visualizer = Rank2D(ax=ax, features=features, algorithm=algorithm)
    visualizer.title = "2D Ranking of Pairs of Features by {}".format(
        algorithm.title())
    visualizer.fit(X, y)
    visualizer.transform(X)
    return visualizer
def rank2(data, name=name, location=location, dcol=dcol, algorithm=algorithm, colormap=colormap, show=show):
    """
    Save a Rank2D plot of ``data`` (without the ``dcol`` columns) as a PNG.

    The remaining columns are cast to float, ranked with ``algorithm`` and
    written to ``Correlation_<algorithm>_<name>.png`` inside ``location``.
    The current figure is closed afterwards and ``name`` is returned.
    """
    numeric_frame = data.drop(dcol, axis=1).astype(float)

    axes = plt.axes()
    ranker = Rank2D(
        ax=axes,
        algorithm=algorithm,
        show_feature_names=show,
        size=(1080, 720),
        colormap=colormap,
    )
    axes.set_title(name)

    ranker.fit(numeric_frame)
    ranker.transform(numeric_frame)

    target_file = os.path.join(location, f"Correlation_{algorithm}_{name}.png")
    ranker.show(outpath=target_file)

    plt.close()
    return name
def feature_analysis(fname="feature_analysis.png"):
    """
    Create the side-by-side feature analysis figure (RadViz and Rank2D) and
    save it as ``fname`` inside the FIGURES directory.
    """
    # One row, two columns: RadViz goes left, Rank2D goes right.
    _, (left_ax, right_ax) = plt.subplots(ncols=2, figsize=(18, 6))

    # Left panel: RadViz on the occupancy data
    occupancy = load_occupancy(split=False)
    radviz = RadViz(ax=left_ax, classes=["unoccupied", "occupied"])
    radviz.fit(occupancy.X, occupancy.y)
    radviz.finalize()

    # Right panel: Rank2D on the concrete data
    concrete = load_concrete(split=False)
    ranker = Rank2D(ax=right_ax)
    ranker.fit_transform(concrete.X, concrete.y)
    ranker.finalize()

    # Save figure
    path = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(path)
# Unpickle model lasso = joblib.load('../lasso_total.pkl') """ Visualizations to create: 1. Rank2d Pearson Ranking of Features 2. Feature Importance 3. Residuals plot 4. Actual vs. Predicted with prediction error 5. Alpha Selection """ # Rank2d (naive, 18 variable case) fig = plt.figure() ax = fig.add_subplot() rank = Rank2D(features=feature_cols, algorithm='pearson', ax=ax) Xt = Xtrain[feature_cols] rank.fit(Xt, ytrain) rank.transform(Xt) rank.poof(outpath="lasso_rank2d.png") # Feature Importances (naive, 18 variable case) fig = plt.figure() ax = fig.add_subplot() featimp = FeatureImportances(lasso, ax=ax) featimp.fit(Xt, ytrain) featimp.poof(outpath="lasso_featureimportances18.png") # Residuals Plot fig = plt.figure() ax = fig.add_subplot()
# Tail of a cell whose visualizer is created above this excerpt; it is fitted
# on the target only.
visualizer.fit(y)
visualizer.poof()

# %%
# Parallel coordinates of the features, given the target y.
visualizer = ParallelCoordinates()
visualizer.fit_transform(X, y)
visualizer.poof()

# %%
# Rank1D with its default settings.
visualizer = Rank1D()
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()

# %%
# Rank2D with its default settings; fitted on the features only.
visualizer = Rank2D()
visualizer.fit_transform(X)
visualizer.poof()

# %%
# Feature/target correlation with the default method.
visualizer = FeatureCorrelation()
visualizer.fit(X, y)
visualizer.poof()

# %%
# Same plot using the 'mutual_info-classification' method.
visualizer = FeatureCorrelation(method='mutual_info-classification')
visualizer.fit(X, y)
visualizer.poof()

# %%
# RadViz labelled with class_names (defined outside this excerpt); the rest
# of this cell is not visible here.
visualizer = RadViz(classes=class_names)
def rank2d():
    """
    Draw the Rank2D covariance plot for the credit dataset on a new figure
    and pass it to ``savefig`` under the name "rank2d_covariance".
    """
    features, target = load_credit()
    axes = newfig()
    ranker = Rank2D(algorithm="covariance", ax=axes)
    ranker.fit_transform(features, target)
    savefig(ranker, "rank2d_covariance")
# Axis labels for the joint plot handled by visualizerJPV (created above this
# excerpt).
plt.ylabel('lambda_sigma', fontsize=14)
plt.xlabel('lambda_weight', fontsize=14)
# Output file name built from the symbol, idx, date index and label name.
# NOTE(review): 'date' has no surrounding underscores here, unlike '_date_'
# in the Rank2D file name below -- confirm whether that is intended.
locationFileNameJPV = os.path.join(
    '/home/ak/Documents/Research/Papers/figures',
    str(symbols[symbolIdx]) + '_idx_' + str(idx) + 'date' + str(dateIdx) +
    '_label' + str(labelName) + '_jointplotViz.png')
visualizerJPV.show(outpath=locationFileNameJPV)
plt.show()

# Instantiate the visualizer with the Pearson ranking algorithm
# (title=' ' passes a single-space title to the visualizer).
set_palette('sns_dark')
plt.figure()
visualizerR2D = Rank2D(features=features, algorithm='pearson', title=' ')
visualizerR2D.fit(X, y)  # Fit the data to the visualizer
visualizerR2D.transform(X)  # Transform the data
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Output file name for the Pearson plot, built from the same identifiers.
locationFileNameR2D = os.path.join(
    '/home/ak/Documents/Research/Papers/figures',
    str(symbols[symbolIdx]) + '_idx_' + str(idx) + '_label' +
    str(labelName) + '_date_' + str(dateIdx) + '_pearsonCorrel.png')
visualizerR2D.show(outpath=locationFileNameR2D)
plt.show()
my_title = " "
# Assemble the five readability/lexical measures (columns 0-4 of hh) into a
# DataFrame, one named column per measure.
dataset = pd.DataFrame({'Lexical_Diversity':hh[:,0],'Brunet_Index':hh[:,1],'Honore_Satistic':hh[:,2],'Flesch Reading':hh[:,3], 'Flesch-Kincaid':hh[:,4] })
# Per-row class label: 5 'Incoherence' rows followed by 10 'Tangentiality'
# rows.
SZ_Type = ['Incoherence', 'Incoherence','Incoherence','Incoherence','Incoherence', 'Tangentiality', 'Tangentiality', 'Tangentiality', 'Tangentiality', 'Tangentiality', 'Tangentiality', 'Tangentiality', 'Tangentiality', 'Tangentiality','Tangentiality' ]
dataset['SZ_Type']=SZ_Type

import seaborn as sns
sns.set(style="ticks")
# sns.pairplot(dataset, hue='SZ_Type', size=1.75)

from yellowbrick.features import Rank2D
# Feature names shown on the Rank2D axes; they match the DataFrame columns
# above (including the 'Honore_Satistic' spelling).
features = ['Lexical_Diversity','Brunet_Index', 'Honore_Satistic', 'Flesch Reading', 'Flesch-Kincaid']
# Instantiate the visualizer with the Pearson ranking algorithm
visualizer = Rank2D(features=features, algorithm='pearson')
# NOTE(review): X is built from `h`, while the DataFrame above is built from
# `hh`; presumably hh is the transpose of h -- confirm against the code that
# defines both.
X=np.transpose(h)
# Numeric class labels in the same order as SZ_Type: 5 zeros, then 10 ones.
Y = np.asarray([0, 0 ,0, 0, 0, 1,1,1, 1,1,1, 1,1,1,1])
visualizer.fit(X, Y)  # Fit the data to the visualizer
visualizer.transform(X)  # Transform the data
visualizer.poof()

# Imports for the classification code that follows this excerpt.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
# Join the two frames column-wise and drop every row that contains a missing
# value.
frames = [morg_2_train_strs_broken, activ_inact_train]
import pandas as pd
dfrad = pd.concat(frames, axis=1)
dfrad = dfrad.dropna()
#dfrad.iloc[:,[2048]]
#dfrad.iloc[:,:100]

# CLASS BALANCE - not balanced
from yellowbrick.target import ClassBalance
visCB = ClassBalance(labels=[1, 0])
visCB.fit(dfrad['activities'])  # Fit the data to the visualizer
visCB.show()  # Finalize and render the figure

# RANK 2D "Pearson correlation" - not balanced
# Only the first 50 columns of dfrad are ranked.
from yellowbrick.features import Rank2D
visualizer = Rank2D(algorithm='pearson')
visualizer.fit(dfrad.iloc[:, :50], dfrad['activities'])  # Fit the data to the visualizer
visualizer.transform(dfrad.iloc[:, :50])  # Transform the data
visualizer.show()  # Finalize and render the figure

# MANIFOLD - not balanced
from yellowbrick.features import Manifold
classes = [1, 0]
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder(
)  # label_encoder object knows how to understand word labels.
# The 'activities' column is overwritten in place with its encoded values.
dfrad['activities'] = label_encoder.fit_transform(
    dfrad['activities'])  # Encode labels
dfrad['activities'].unique()  # result is not stored (notebook-style inspection)
viz = Manifold(manifold="tsne", classes=classes)  # Instantiate the visualizer
# Columns to rank. Each column is listed exactly once: the list previously
# repeated 'is_pickup', 'is_delivery' and 'is_restaurant_reservation', which
# made data[features] contain those columns twice, so they were ranked and
# plotted twice.
features = [
    'price', 'rating', 'review_count', 'high_risk_1', 'medium_risk_2',
    'low_risk_2', 'is_pickup', 'is_delivery', 'is_restaurant_reservation',
    'Canvass', 'Complaint', 'reinspection', 'License', 'FoodPoison',
]

X = data[features]
y = data['pass']

# 1D
visualizer = Rank1D(features=features, algorithm='shapiro')
visualizer.fit(X, y)  # Fit the data to the visualizer
visualizer.transform(X)  # Transform the data
visualizer.poof(outpath="1D_features.png")  # Draw/show/poof the data

# 2D
visualizer = Rank2D(features=features, algorithm='covariance')
visualizer.fit(X, y)  # Fit the data to the visualizer
visualizer.transform(X)  # Transform the data
visualizer.poof(outpath="2D_features.png")  # Draw/show/poof the data

# 1D with other features but including rating
features = [
    'rating', 'is_african', 'is_asian_fusion', 'is_bakeries', 'is_bars',
    'is_breakfast_brunch', 'is_buffets', 'is_cafes', 'is_caribbean',
    'is_chinese', 'is_deli', 'is_eastern_european', 'is_european',
    'is_fast_food', 'is_hawaiian', 'is_health_food', 'is_icecream',
]

X = data[features]
y = data['pass']
def pearson_features(ml_array_):
    """Fit a Rank2D Pearson visualizer on ``ml_array_`` and show the plot."""
    pearson_viz = Rank2D(algorithm="pearson")
    pearson_viz.fit_transform(ml_array_)
    pearson_viz.show()
# Create an explainer instance from the training data
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=X_train.columns.tolist(),
    class_names=y_train.unique())

# Create a lambda function that uses the model to predict class
# probabilities for the data (cast to float)
predict_fn = lambda x: model.predict_proba(x).astype(float)

# Use the explainer to explain the prediction for the first test row.
# Fixed: the method is `explain_instance`; the misspelled `explain_instence`
# raised AttributeError.
exp = explainer.explain_instance(X_test.values[0], predict_fn, num_features=6)
exp.show_in_notebook(show_all=False)

"""## Yellowbrick"""

# Heatmap for correlation
visualizer = Rank2D(algorithm="pearson", size=(1080, 720))
visualizer.fit_transform(X_train)
visualizer.poof()

# Evaluation metrics
# NOTE(review): the report is scored on the training data itself -- confirm
# this is intended rather than a held-out split.
visualizer = ClassificationReport(model, size=(1080, 720))
visualizer.fit(X_train, y_train)
visualizer.score(X_train, y_train)
# Fixed: `poff()` was a typo for `poof()` and raised AttributeError.
visualizer.poof()

"""# Using API in WebApp (Flask)"""

# Commented out IPython magic to ensure Python compatibility.
# %%writefile server.py
#
'Complaint', 'reinspection', 'License', 'FoodPoison', 'high_risk_1',
'medium_risk_2', 'low_risk_2', 'grocery', 'Bakery', 'Mobile']
# NOTE(review): the lines above are the tail of a list opened before this
# excerpt; presumably it is the `cols` list used below -- confirm.

X = data[cols]
y = data['pass']

# Hold out 10% of the rows for testing (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state = 42)

clf = linear_model.Lasso(alpha=0.5)
clf.fit(X_train, y_train)
clf.predict(X_test)  # NOTE(review): the prediction result is discarded here

# A separate Lasso() instance (constructed without alpha=0.5) is wrapped in
# the prediction-error visualizer; it is fitted but not shown within this
# excerpt.
visualizer = PredictionError(Lasso())
visualizer.fit(X_train, y_train)

# Rank2D with the visualizer's default algorithm
oz = Rank2D(features=cols)
oz.fit_transform(X, y)
oz.poof()

# Rank2D with the covariance algorithm
oz = Rank2D(features=cols, algorithm='covariance')
oz.fit_transform(X, y)
oz.poof()

# Hex-bin joint plots of review_count vs rating and price vs rating
g = sns.jointplot(x='review_count', y='rating', kind='hex', data=data)
h = sns.jointplot(x='price', y='rating', kind='hex', data=data)

# Encode the target; y is rebound to the encoded values from here on.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

oz = RadViz(classes=label_encoder.classes_, features=cols)
oz.fit(X, y)
import pandas as pd
from yellowbrick.features import Rank2D

# Columns of the bikeshare data that are ranked against each other.
feature_columns = [
    "season",
    "month",
    "hour",
    "holiday",
    "weekday",
    "workingday",
    "weather",
    "temp",
    "feelslike",
    "humidity",
    "windspeed",
]

data = pd.read_csv('../CSV/bikeshare.csv')
X = data[feature_columns]
y = data["riders"]

# Pairwise Pearson ranking of the feature columns.
visualizer = Rank2D(algorithm="pearson")
visualizer.fit_transform(X)
visualizer.poof()

'''
This figure shows us the Pearson correlation between pairs of features such that each cell in the grid represents two features identified in order on the x and y axes and whose color displays the magnitude of the correlation. A Pearson correlation of 1.0 means that there is a strong positive, linear relationship between the pairs of variables and a value of -1.0 indicates a strong negative, linear relationship (a value of zero indicates no relationship). Therefore we are looking for dark red and dark blue boxes to identify further. In this chart, we see that the features temp and feelslike have a strong correlation and also that the feature season has a strong correlation with the feature month. This seems to make sense; the apparent temperature we feel outside depends on the actual temperature and other airquality factors, and the season of the year is described by the month!
'''
def visualize_features(classes, problem_type, curdir, default_features, balance_data, test_size):
    """
    Generate a session of feature/model visualizations as PNG files.

    Features and labels are obtained from ``get_features`` and a series of
    plots is written below a freshly created ``visualization_session``
    directory (sub-directories: ``clustering``, ``feature_ranking`` with
    nested ``feature_plots`` folders, and ``model_selection``).

    :param classes: class names; passed to ``get_features`` and to the
        visualizers' ``classes`` argument
    :param problem_type: forwarded to ``get_features``
    :param curdir: directory to change into before anything is written
    :param default_features: forwarded to ``get_features``
    :param balance_data: forwarded to ``get_features``
    :param test_size: forwarded to ``train_test_split``
    :return: always the empty string ``''``

    Side effects: changes the process working directory repeatedly and
    deletes an existing ``visualization_session`` directory.
    NOTE(review): ``curdir`` is rebound to the session directory part-way
    through, so the final ``os.chdir(curdir)`` leaves the process inside
    ``visualization_session``, not in the directory passed in -- confirm
    callers expect that.
    """
    # make features into label encoder here
    features, feature_labels, class_labels = get_features(
        classes, problem_type, default_features, balance_data)

    # now preprocess features for all the other plots
    os.chdir(curdir)
    le = preprocessing.LabelEncoder()
    le.fit(class_labels)
    tclass_labels = le.transform(class_labels)

    # process features to help with clustering
    se = preprocessing.StandardScaler()
    t_features = se.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        tclass_labels,
                                                        test_size=test_size,
                                                        random_state=42)

    # print(len(features))
    # print(len(feature_labels))
    # print(len(class_labels))
    # print(class_labels)

    # GET TRAINING DATA DURING MODELING PROCESS
    ##################################
    # get filename
    # csvfile=''
    # print(classes)
    # for i in range(len(classes)):
    #     csvfile=csvfile+classes[i]+'_'

    # get training and testing data for later
    # try:
    #     print('loading training files...')
    #     X_train=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'train.csv')
    #     y_train=X_train['class_']
    #     X_train.drop(['class_'], axis=1)
    #     X_test=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'test.csv')
    #     y_test=X_test['class_']
    #     X_test.drop(['class_'], axis=1)
    #     y_train=le.inverse_transform(y_train)
    #     y_test=le.inverse_transform(y_test)
    # except:
    # print('error loading in training files, making new test data')

    # Visualize each class (quick plot)
    ##################################
    visualization_dir = 'visualization_session'
    try:
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)
    except FileExistsError:
        # A previous session exists: wipe it and start over. Only this case
        # is handled; a bare ``except`` used to swallow every other error
        # (including KeyboardInterrupt) before attempting the rmtree.
        shutil.rmtree(visualization_dir)
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)

    objects = tuple(set(class_labels))
    y_pos = np.arange(len(objects))
    performance = [class_labels.count(obj) for obj in objects]

    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.xticks(rotation=90)
    plt.title('Counts per class')
    plt.ylabel('Count')
    plt.xlabel('Class')
    plt.tight_layout()
    plt.savefig('classes.png')
    plt.close()

    # set current directory (from here on ``curdir`` is the session directory)
    curdir = os.getcwd()

    ##################################
    # CLUSTERING!!!
    ##################################

    ##################################
    # Manifold type options
    ##################################
    # "lle"      Locally Linear Embedding (LLE) uses many local linear
    #            decompositions to preserve globally non-linear structures.
    # "ltsa"     LTSA LLE: local tangent space alignment is similar to LLE in
    #            that it uses locality to preserve neighborhood distances.
    # "hessian"  Hessian LLE an LLE regularization method that applies a
    #            hessian-based quadratic form at each neighborhood
    # "modified" Modified LLE applies a regularization parameter to LLE.
    # "isomap"   Isomap seeks a lower dimensional embedding that maintains
    #            geometric distances between each instance.
    # "mds"      MDS: multi-dimensional scaling uses similarity to plot points
    #            that are near to each other close in the embedding.
    # "spectral" Spectral Embedding a discrete approximation of the low
    #            dimensional manifold using a graph representation.
    # "tsne"     (default) t-SNE: converts the similarity of points into
    #            probabilities then uses those probabilities to create an
    #            embedding.
    os.mkdir('clustering')
    os.chdir('clustering')

    # tSNE
    plt.figure()
    viz = Manifold(manifold="tsne", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="tsne.png")
    plt.close()
    # os.system('open tsne.png')
    # viz.show()

    # PCA
    plt.figure()
    visualizer = PCADecomposition(scale=True, classes=set(classes))
    visualizer.fit_transform(np.array(features), tclass_labels)
    visualizer.poof(outpath="pca.png")
    plt.close()
    # os.system('open pca.png')

    # spectral embedding
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # lle embedding
    plt.figure()
    viz = Manifold(manifold="lle", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="lle.png")
    plt.close()

    # ltsa
    # plt.figure()
    # viz = Manifold(manifold="ltsa", classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="ltsa.png")
    # plt.close()

    # hessian
    # plt.figure()
    # viz = Manifold(manifold="hessian", method='dense', classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="hessian.png")
    # plt.close()

    # modified
    plt.figure()
    viz = Manifold(manifold="modified", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="modified.png")
    plt.close()

    # isomap
    plt.figure()
    viz = Manifold(manifold="isomap", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="isomap.png")
    plt.close()

    # mds
    plt.figure()
    viz = Manifold(manifold="mds", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="mds.png")
    plt.close()

    # spectral
    # NOTE(review): this repeats the spectral embedding above and overwrites
    # spectral.png -- confirm whether the second run is needed.
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # UMAP embedding
    # NOTE(review): this is the only embedding fitted with the raw
    # class_labels instead of the encoded tclass_labels -- confirm intent.
    plt.figure()
    umap = UMAPVisualizer(metric='cosine',
                          classes=set(classes),
                          title="UMAP embedding")
    umap.fit_transform(np.array(features), class_labels)
    umap.poof(outpath="umap.png")
    plt.close()

    # alternative UMAP
    # import umap.plot
    # plt.figure()
    # mapper = umap.UMAP().fit(np.array(features))
    # fig=umap.plot.points(mapper, labels=np.array(tclass_labels))
    # fig = fig.get_figure()
    # fig.tight_layout()
    # fig.savefig('umap2.png')
    # plt.close(fig)

    #################################
    # FEATURE RANKING!!
    #################################
    os.chdir(curdir)
    os.mkdir('feature_ranking')
    os.chdir('feature_ranking')

    # You can get the feature importance of each feature of your dataset
    # by using the feature importance property of the model.
    plt.figure(figsize=(12, 12))
    model = ExtraTreesClassifier()
    model.fit(np.array(features), tclass_labels)
    # print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_,
                                 index=feature_labels[0])
    feat_importances.nlargest(20).plot(kind='barh')
    # NOTE(review): the first title is immediately replaced by the second
    # call -- confirm which one is wanted.
    plt.title('Feature importances (ExtraTrees)', size=16)
    plt.title('Feature importances with %s features' %
              (str(len(features[0]))))
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    # os.system('open feature_importance.png')

    # get selected labels for top 20 features
    selectedlabels = list(dict(feat_importances.nlargest(20)))
    # new_features / new_labels come from the standardized features,
    # new_features_ / new_labels_ from the raw features.
    new_features, new_labels = restructure_features(selectedlabels,
                                                    t_features,
                                                    feature_labels[0])
    new_features_, new_labels_ = restructure_features(selectedlabels,
                                                      features,
                                                      feature_labels[0])

    # Shapiro rank algorithm (1D)
    plt.figure(figsize=(28, 12))
    visualizer = Rank1D(algorithm='shapiro',
                        classes=set(classes),
                        features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    # plt.tight_layout()
    visualizer.poof(outpath="shapiro.png")
    # NOTE(review): the title is set after poof() has been given the output
    # path, so it presumably never reaches shapiro.png -- confirm.
    plt.title('Shapiro plot (top 20 features)', size=16)
    plt.close()
    # os.system('open shapiro.png')
    # visualizer.show()

    # pearson ranking algorithm (2D)
    plt.figure(figsize=(12, 12))
    visualizer = Rank2D(algorithm='pearson',
                        classes=set(classes),
                        features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    plt.tight_layout()
    visualizer.poof(outpath="pearson.png")
    # NOTE(review): same ordering as the Shapiro plot -- title set after poof.
    plt.title('Pearson ranking plot (top 20 features)', size=16)
    plt.close()
    # os.system('open pearson.png')
    # visualizer.show()

    # feature importances with top 20 features for Lasso
    plt.figure(figsize=(12, 12))
    viz = FeatureImportances(Lasso(), labels=new_labels_)
    viz.fit(np.array(new_features_), tclass_labels)
    plt.tight_layout()
    viz.poof(outpath="lasso.png")
    plt.close()

    # correlation plots with feature removal if corr > 0.90
    # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf

    # now remove correlated features
    # --> p values
    # --> https://towardsdatascience.com/the-next-level-of-data-visualization-in-python-dd6e99039d5e / https://github.com/WillKoehrsen/Data-Analysis/blob/master/plotly/Plotly%20Whirlwind%20Introduction.ipynb- plotly for correlation heatmap and scatterplot matrix
    # --> https://seaborn.pydata.org/tutorial/distributions.html

    data = new_features
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap with correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap.png')
    plt.close(fig)

    # Drop the later column of every pair whose correlation is >= 0.9.
    columns = np.full((corr.shape[0], ), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= 0.9:
                if columns[j]:
                    columns[j] = False
    selected_columns = data.columns[columns]
    data = data[selected_columns]
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap without correlated features (top 20 features)',
              size=16)
    fig.tight_layout()
    fig.savefig('heatmap_clean.png')
    plt.close(fig)

    # radviz
    # Instantiate the visualizer
    plt.figure(figsize=(12, 12))
    visualizer = RadViz(classes=classes, features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    visualizer.poof(outpath="radviz.png")
    visualizer.show()
    plt.close()

    # feature correlation plot
    plt.figure(figsize=(28, 12))
    visualizer = feature_correlation(np.array(new_features),
                                     tclass_labels,
                                     labels=new_labels)
    visualizer.poof(outpath="correlation.png")
    visualizer.show()
    plt.tight_layout()
    plt.close()

    os.mkdir('feature_plots')
    os.chdir('feature_plots')

    # One violin plot per raw top feature, split by class.
    # The 'classes' column is added to new_features_ itself (no copy).
    newdata = new_features_
    newdata['classes'] = class_labels

    for j, label in enumerate(new_labels_):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[label])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), label))
        plt.close(fig)

    # NOTE(review): this directory is created inside feature_plots (the
    # working directory was not reset) -- confirm the nesting is intended.
    os.mkdir('feature_plots_transformed')
    os.chdir('feature_plots_transformed')

    # Same plots for the standardized top features.
    newdata = new_features
    newdata['classes'] = class_labels

    for j, label in enumerate(new_labels):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[label])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), label))
        plt.close(fig)

    ##################################################
    # PRECISION-RECALL CURVES
    ##################################################

    os.chdir(curdir)
    os.mkdir('model_selection')
    os.chdir('model_selection')

    plt.figure()
    visualizer = precision_recall_curve(GaussianNB(), np.array(features),
                                        tclass_labels)
    visualizer.poof(outpath="precision-recall.png")
    plt.close()

    plt.figure()
    visualizer = roc_auc(LogisticRegression(), np.array(features),
                         tclass_labels)
    visualizer.poof(outpath="roc_curve_train.png")
    plt.close()

    plt.figure()
    visualizer = discrimination_threshold(
        LogisticRegression(multi_class="auto", solver="liblinear"),
        np.array(features), tclass_labels)
    visualizer.poof(outpath="thresholds.png")
    plt.close()

    plt.figure()
    visualizer = residuals_plot(Ridge(),
                                np.array(features),
                                tclass_labels,
                                train_color="maroon",
                                test_color="gold")
    visualizer.poof(outpath="residuals.png")
    plt.close()

    plt.figure()
    visualizer = prediction_error(Lasso(), np.array(features), tclass_labels)
    visualizer.poof(outpath='prediction_error.png')
    plt.close()

    # outlier detection
    plt.figure()
    visualizer = cooks_distance(np.array(features),
                                tclass_labels,
                                draw_threshold=True,
                                linefmt="C0-",
                                markerfmt=",")
    visualizer.poof(outpath='outliers.png')
    plt.close()

    # cluster numbers
    plt.figure()
    visualizer = silhouette_visualizer(
        KMeans(len(set(tclass_labels)), random_state=42), np.array(features))
    visualizer.poof(outpath='siloutte.png')
    plt.close()

    # cluster distance
    plt.figure()
    visualizer = intercluster_distance(
        KMeans(len(set(tclass_labels)), random_state=777),
        np.array(features))
    visualizer.poof(outpath='cluster_distance.png')
    plt.close()

    # plot percentile of features plot with SVM to see which percentile for features is optimal
    # ``features`` is rebound to its min-max scaled version from here on.
    features = preprocessing.MinMaxScaler().fit_transform(features)
    clf = Pipeline([('anova', SelectPercentile(chi2)),
                    ('scaler', StandardScaler()),
                    ('logr', LogisticRegression())])
    score_means = list()
    score_stds = list()
    percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100)

    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, np.array(features), class_labels)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title(
        'Performance of the LogisticRegression-Anova varying the percent features selected'
    )
    plt.xticks(np.linspace(0, 100, 11, endpoint=True))
    plt.xlabel('Percentile')
    plt.ylabel('Accuracy Score')
    plt.axis('tight')
    plt.savefig('logr_percentile_plot.png')
    plt.close()

    # get PCA
    pca = PCA(random_state=1)
    pca.fit(X_train)
    skplt.decomposition.plot_pca_component_variance(pca)
    plt.savefig('pca_explained_variance.png')
    plt.close()

    # estimators
    rf = RandomForestClassifier()
    skplt.estimators.plot_learning_curve(rf, X_train, y_train)
    plt.title('Learning Curve (Random Forest)')
    plt.savefig('learning_curve.png')
    plt.close()

    # elbow plot
    kmeans = KMeans(random_state=1)
    skplt.cluster.plot_elbow_curve(kmeans,
                                   X_train,
                                   cluster_ranges=range(1, 30),
                                   title='Elbow plot (KMeans clustering)')
    plt.savefig('elbow.png')
    plt.close()

    # KS statistic (only if 2 classes)
    lr = LogisticRegression()
    lr = lr.fit(X_train, y_train)
    y_probas = lr.predict_proba(X_test)
    skplt.metrics.plot_ks_statistic(y_test, y_probas)
    plt.savefig('ks.png')
    plt.close()

    # precision-recall
    # NOTE(review): this overwrites the precision-recall.png written earlier
    # in this same directory -- confirm.
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_probas = nb.predict_proba(X_test)
    skplt.metrics.plot_precision_recall(y_test, y_probas)
    plt.tight_layout()
    plt.savefig('precision-recall.png')
    plt.close()

    ## plot calibration curve
    rf = RandomForestClassifier()
    lr = LogisticRegression()
    nb = GaussianNB()
    svm = LinearSVC()
    dt = DecisionTreeClassifier(random_state=0)
    ab = AdaBoostClassifier(n_estimators=100)
    gb = GradientBoostingClassifier(n_estimators=100,
                                    learning_rate=1.0,
                                    max_depth=1,
                                    random_state=0)
    knn = KNeighborsClassifier(n_neighbors=7)

    rf_probas = rf.fit(X_train, y_train).predict_proba(X_test)
    lr_probas = lr.fit(X_train, y_train).predict_proba(X_test)
    nb_probas = nb.fit(X_train, y_train).predict_proba(X_test)
    # svm_scores = svm.fit(X_train, y_train).predict_proba(X_test)
    dt_scores = dt.fit(X_train, y_train).predict_proba(X_test)
    ab_scores = ab.fit(X_train, y_train).predict_proba(X_test)
    gb_scores = gb.fit(X_train, y_train).predict_proba(X_test)
    knn_scores = knn.fit(X_train, y_train).predict_proba(X_test)

    # probas_list and clf_names must stay in the same order (SVM disabled in
    # both).
    probas_list = [
        rf_probas,
        lr_probas,
        nb_probas,
        # svm_scores,
        dt_scores,
        ab_scores,
        gb_scores,
        knn_scores
    ]

    clf_names = [
        'Random Forest',
        'Logistic Regression',
        'Gaussian NB',
        # 'SVM',
        'Decision Tree',
        'Adaboost',
        'Gradient Boost',
        'KNN'
    ]

    skplt.metrics.plot_calibration_curve(y_test, probas_list, clf_names)
    # Apply the layout before saving so it is reflected in calibration.png
    # (it used to be applied after the file had already been written).
    plt.tight_layout()
    plt.savefig('calibration.png')
    plt.close()

    # pick classifier type by ROC (without optimization)
    probs = [
        rf_probas[:, 1],
        lr_probas[:, 1],
        nb_probas[:, 1],
        # svm_scores[:, 1],
        dt_scores[:, 1],
        ab_scores[:, 1],
        gb_scores[:, 1],
        knn_scores[:, 1]
    ]

    plot_roc_curve(y_test, probs, clf_names)

    # more elaborate ROC example with CV = 5 fold
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py

    os.chdir(curdir)

    return ''
# Show the first 10 rows of the data
print(CSV_data.head(10))

# Check the summary statistics
print(CSV_data.describe())

# Split into explanatory variables X and target variables y
# Explanatory variables X = all rows (:), the 'right' and 'left' columns
print("説明変数")
X = CSV_data.loc[:, ['right', 'left']]
print(X)

# Target variables y = all rows (:), the 'wa', 'sa' and 'seki' columns
print("目的変数")
y = CSV_data.loc[:, ['wa', 'sa', 'seki']]
print(y)

# Fixed: the keyword was misspelled `alorithm`, so the ranking algorithm was
# never actually passed under its real parameter name.
visualiser = Rank2D(algorithm='pearson')
visualiser.fit(X, y)
visualiser.transform(X)
visualiser.poof()

# Split into training data and test data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)


# Build the neural network
def createModel():
    print("構築")
    model = Sequential()
if os.environ.get('VIZ', '0') == '1': from yellowbrick.features import Rank1D, Rank2D, RadViz, ParallelCoordinates import matplotlib.pyplot as plt print("Rank1D...") features = train_columns visualizer = Rank1D(features=features, algorithm='shapiro') visualizer.fit(X, Y) # Fit the data to the visualizer visualizer.transform(X) # Transform the data visualizer.poof(outpath='viz_feature_rank1d.pdf', bbox_inches='tight') plt.close('all') feature_diversity = visualizer.ranks_ # Instantiate the visualizer with the Covariance ranking algorithm print("Rank2D...") visualizer = Rank2D(features=features, algorithm='spearman') visualizer.fit(X, Y) # Fit the data to the visualizer visualizer.transform(X) # Transform the data visualizer.poof(outpath='viz_feature_rank2d.pdf', bbox_inches='tight') plt.close('all') """ # reorder the features so similar ones are together features_to_handle = list(range(len(features))) print(features_to_handle) features_ordered = [] last_feature = 0 numpy.random.seed(1) while features_to_handle: print("%d ..." % last_feature, feature_distance.shape) invdists = 1. / (visualizer.ranks_[last_feature,features_to_handle]**2 + 1e-5) invdists /= invdists.sum()