def plot_uturns(summary, critical_rad=2.9, save=False, condition='Condition', context='notebook'):
    """Plot and print steepest turns over more than critical_rad"""
    turn_column = next(col for col in summary.columns if col.startswith('Max. Turn Over'))
    columns_of_interest = ['Skew Lines Distance', 'Mean Velocity', 'Arrest Coefficient',
                           condition, turn_column]
    uturns = summary[summary[turn_column] > critical_rad]
    skip_steps = int(next(word for word in turn_column.split() if word.isdigit()))
    print('\nPlotting turns with more than {} rad over {} steps'.format(
        critical_rad, skip_steps))
    for cond, cond_uturns in uturns.groupby(condition):
        n_tracks = len(summary[summary[condition] == cond])
        n_turns = len(cond_uturns)
        print(' {} tracks in {} with {} U-Turns ({:2.2f} %).'.format(
            n_tracks, cond, n_turns, n_turns/n_tracks*100))
    sns.set(style='white', context=context)
    sns.pairplot(uturns[columns_of_interest], hue=condition, diag_kind='kde')
    plt.tight_layout()
    if save:
        conditions = [cond.replace('= ', '') for cond in summary[condition].unique()]
        plt.savefig('U-Turns_' + '-'.join(conditions) +
                    '_{:1.1f}over{}steps.png'.format(critical_rad, skip_steps), dpi=300)
    else:
        plt.show()
def useDataAnalysis(df_train_origin):
    # Wind speed
    df_train_origin.groupby('windspeed').mean().plot(y='count', marker='o')
    # plt.show()
    # Humidity
    df_train_origin.groupby('humidity').mean().plot(y='count', marker='o')
    # plt.show()
    # Temperature
    df_train_origin.groupby('temp').mean().plot(y='count', marker='o')
    # plt.show()
    # Temperature vs. humidity
    df_train_origin.plot(x='temp', y='humidity', kind='scatter')
    # plt.show()
    # Scatter plots for the individual dimensions
    fig, axs = plt.subplots(2, 3, sharey=True)
    df_train_origin.plot(kind='scatter', x='temp', y='count', ax=axs[0, 0], figsize=(16, 8), color='magenta')
    df_train_origin.plot(kind='scatter', x='atemp', y='count', ax=axs[0, 1], color='cyan')
    df_train_origin.plot(kind='scatter', x='humidity', y='count', ax=axs[0, 2], color='red')
    df_train_origin.plot(kind='scatter', x='windspeed', y='count', ax=axs[1, 0], color='yellow')
    df_train_origin.plot(kind='scatter', x='month', y='count', ax=axs[1, 1], color='blue')
    df_train_origin.plot(kind='scatter', x='hour', y='count', ax=axs[1, 2], color='green')
    sns.pairplot(df_train_origin[["temp", "month", "humidity", "count"]], hue="count")
    # Now look at the correlations
    corr = df_train_origin[['temp', 'weather', 'windspeed', 'day', 'month', 'hour', 'count']].corr()
    print(corr)
    # Use color intensity to show correlation strength
    plt.figure()
    plt.matshow(corr)
    plt.colorbar()
    plt.show()
def pairplot():
    sns.set_style('white', {'axes.grid': True, 'axes.edgecolor': '0'})
    sns.set_context('paper', font_scale=1.5, rc={'lines.linewidth': 1})
    # sns.despine()
    WT_pairplot = sns.pairplot(df_subset,
                               vars=['Experimental DDG', 'Predicted DDG', 'Absolute Error DDG',
                                     'Mutant Complex REU', 'RMSD'],
                               height=3, hue='WT PDBID',
                               hue_order=sorted(list(mutant_df['WT PDBID'].unique())),
                               kind='scatter', diag_kind='hist')
    # fig.legend cannot take raw data columns as handles; use the PairGrid's own legend
    WT_pairplot.add_legend(bbox_to_anchor=(1.05, 0.5))
    lgd = WT_pairplot._legend
    output_pdf.attach_note('This pairplot compares the various numerical variables contained within the mutant_df dataframe. The following variables are compared in a pairwise fashion where hue is WT PDBID: Experimental DDG, Predicted DDG, Absolute Error DDG, Mutant Complex Rosetta Energy, and RMSD')
    title = WT_pairplot.fig.suptitle('PairPlot for %s' % description, fontsize=24, y=1.05)
    output_pdf.savefig(WT_pairplot.fig, pad_inches=1, bbox_extra_artists=[title, lgd], bbox_inches='tight')

    Mut_pairplot = sns.pairplot(df_subset,
                                vars=['Experimental DDG', 'Predicted DDG', 'Absolute Error DDG',
                                      'Mutant Complex REU', 'RMSD'],
                                height=3, hue='Mutant PDBID',
                                hue_order=sorted(list(mutant_df['Mutant PDBID'].unique())),
                                kind='scatter', diag_kind='hist')
    Mut_pairplot.add_legend(bbox_to_anchor=(1.05, 0.5))
    lgd = Mut_pairplot._legend
    output_pdf.attach_note('This pairplot compares the various numerical variables contained within the mutant_df dataframe. The following variables are compared in a pairwise fashion where hue is Mutant PDBID: Experimental DDG, Predicted DDG, Absolute Error DDG, Mutant Complex Rosetta Energy, and RMSD')
    title = Mut_pairplot.fig.suptitle('PairPlot for %s' % description, fontsize=24, y=1.05)
    output_pdf.savefig(Mut_pairplot.fig, pad_inches=1, bbox_extra_artists=[title, lgd], bbox_inches='tight')
def Plot_data_df(self, df_2, label, strac, pair=True):
    """Scatter each column against its row index next to its distribution, then optionally pairplot."""
    import matplotlib.pyplot as plt
    import seaborn as sns
    import pandas as pd
    x = range(df_2.shape[0])
    maxn = df_2.shape[1]
    fig = plt.figure(1, figsize=(10, maxn + 10))
    sns.set(style='darkgrid', color_codes=True)
    for i in range(maxn):  # xrange -> range for Python 3
        y = df_2.iloc[:, i]  # .ix was removed from pandas; use .iloc
        plot_context = fig.add_subplot(maxn, 2, 2 * i + 1)
        plot_context.scatter(x, y, c=label, s=10, alpha=0.7)
        plt.title(strac[i])
        fig.add_subplot(maxn, 2, 2 * i + 2)
        sns.distplot(y, axlabel=False)
        fig.tight_layout()
        plt.title(strac[i])
    df_label = pd.DataFrame({'label': list(label)})
    df_2 = pd.concat([df_2, df_label], axis=1, join='inner')
    if pair:
        sns.pairplot(df_2, vars=strac, hue='label')
    plt.show()
def feature_analysis(self):
    """
    Make a plot to visualize important features to separate labels
    """
    if self.is_voicedata:
        # explore all paired scatter plots
        seaborn.set_context("poster")
        plt.figure(figsize=(10, 8))
        plot_all = seaborn.pairplot(self.data, hue="label")
        plt.suptitle("Feature analysis for voice dataset - all features")
        plot_all.savefig("../Figures/voice_exploration.png", bbox_inches="tight")
        # explore paired scatter plots with selected features
        plt.figure(figsize=(10, 8))
        plot_selected = seaborn.pairplot(self.data[["skew", "kurt", "meanfun",
                                                    "meanfreq", "IQR", "label"]], hue="label")
        plt.suptitle("Feature analysis for voice dataset - selected features")
        plot_selected.savefig("../Figures/voice_exploration_selected.png", bbox_inches="tight")
    else:
        seaborn.set_context("poster")
        plt.figure(figsize=(10, 8))
        # explore all paired scatter plots
        plot_all = seaborn.pairplot(self.data, hue="Self-defined label")
        plt.suptitle("Feature analysis for EEG dataset - all features")
        plot_all.savefig("../Figures/EEG_exploration.png", bbox_inches="tight")
        # explore paired scatter plots with selected features
        plt.figure(figsize=(10, 8))
        plot_selected = seaborn.pairplot(self.data[["Delta", "Theta", "Alpha 1",
                                                    "Beta 2", "Gamma1", "Self-defined label"]],
                                         hue="Self-defined label")
        plt.suptitle("Feature analysis for EEG dataset - selected features")
        plot_selected.savefig("../Figures/EEG_exploration_selected.png", bbox_inches="tight")
def scatter_corrplot_parameters(self, params):
    """ Plots two parameters from the catalog against each other. """
    snsdf = self.pandas_data_frame(params)
    print(snsdf)
    sns.pairplot(snsdf, dropna=True, height=10)  # `size` was renamed to `height` in seaborn 0.9
def scatterplot():
    '''Fancy scatterplots, using the package "seaborn" '''
    import seaborn as sns
    df = sns.load_dataset("iris")
    sns.pairplot(df, hue="species", height=2.5)
    C2_8_mystyle.printout_plain('multiScatterplot.png')
def visualize_data():
    '''Visualize the data'''
    sns.set(style='whitegrid', context='notebook')
    sns.pairplot(df[cols], height=2.5)
    plt.show()
def pair(self, vars=None, save=False):
    sns.pairplot(self._data[vars])
    if save:
        self._save()
    else:
        plt.show()
    plt.close()
def plot_2D_by_sns(subplot, data, target, target_names):
    # Convert to a pandas DataFrame
    # pdata = pd.DataFrame(data, columns=['diss', 'corr', 'h**o', 'energy', 'asm', 'contrast', 'lbp', 'lbp_integrals', '9', '10', '11', '12', '13', '14'])
    # sns.set(color_codes=True)
    sns.set()
    pdata = pd.DataFrame(data)
    # pairplot's hue must be a column of the frame, so attach the class labels
    pdata['species'] = [target_names[t] for t in target]
    sns.pairplot(data=pdata, hue='species', markers=["o", "s", "D"])
def statistical_analysis(df):
    """ Check correlation of features to spread """
    # correlation matrix
    corrmat = df.corr()
    f, ax = plt.subplots(figsize=(12, 9))
    hm = sns.heatmap(corrmat, cbar=True, annot=True, square=True, fmt='.2f')
    plt.yticks(rotation=0)
    plt.xticks(rotation=90)

    corrvec = abs(df.corr()['result_spread'].copy())
    print(corrvec.sort_values())

    # scatterplot
    sns.set()
    cols = ['result_spread', 'rush_attempt_diff', 'turn_diff', 'yards_diff',
            'third_diff', 'sack_diff', 'sack_ydiff', 'p_attempt_diff']
    sns.pairplot(df[cols], height=2.5)

    # normality_check(df['result_spread'])
    # normality_check(df['rush_attempt_diff'])
    # normality_check(df['turn_diff'])
    # normality_check(df['yards_diff'])
    # normality_check(df['third_diff'])
    # normality_check(df['sack_diff'])
    # normality_check(df['sack_ydiff'])
    # normality_check(df['poss_diff'])
    # normality_check(df['p_attempt_diff'])

    """
    Rush attempt shows light tails but otherwise these main features appear
    normally distributed
    """
def pairplot(df, hue_name=None):
    if hue_name is not None:
        df['target'] = hue_name
        sns.pairplot(df, hue='target')
    else:
        sns.pairplot(df)
def do_pairplots(counts, base_dir, sample):
    """ Produces three pairplots - one for each group and a joint plot. """
    markers = ["o", "s"]
    r, total_gems, assigned_gems, assigned_gems_by_para = assign_gems(counts)
    df = pd.DataFrame.from_dict(r)
    unique_gems = find_unique_gems(assigned_gems_by_para)
    num_unique = len(unique_gems)
    num_not_unique = len(df) - num_unique
    unique_bins = ["{:,} unique".format(num_unique) if x in unique_gems
                   else "{:,} not unique".format(num_not_unique) for x in df["GemId"]]
    df["Unique mappings"] = unique_bins
    sns_plot = sns.pairplot(df, hue="Unique mappings", markers=markers, plot_kws=dict(s=10))
    sns_plot.fig.text(0.87, 0.6, "{:,} Total Gems".format(len(total_gems)))
    sns_plot.savefig(os.path.join(base_dir, "{}_combined_plot.pdf".format(sample)), format="pdf")
    # now re-label to simply unique/not unique and make separate pairplots
    unique_simple_bins = ["Unique" if x in unique_gems else "Not Unique" for x in df["GemId"]]
    df["Unique mappings"] = unique_simple_bins
    for i, subset in enumerate(["Unique", "Not Unique"]):
        df2 = df[df["Unique mappings"] == subset]
        color = sns.color_palette()[i]
        cmap = sns.light_palette(color, as_cmap=True)
        sns_plot = sns.pairplot(df2, markers=markers[i], plot_kws=dict(color=color, s=10))
        sns_plot.map_lower(sns.kdeplot, cmap=cmap, n_levels=50)
        p = subset.replace(" ", "_").lower()
        sns_plot.savefig(os.path.join(base_dir, "{}_{}_combined_plot.pdf".format(sample, p)), format="pdf")
        plt.close('all')
def Pairplot(feature_mat, weight=None):
    '''Plot pairplot for given feature matrix'''
    if weight is None:
        sns.pairplot(feature_mat)
    else:
        g = sns.pairplot(feature_mat, hue=weight,
                         palette=sns.color_palette("GnBu_d", n_colors=len(feature_mat)),
                         vars=feature_mat.columns.values[:-1])
        # g = g.map(plt.scatter)
    plt.show()  # sns.plt was removed from seaborn; call matplotlib directly
def scatterplot():
    import seaborn as sns
    sns.set()
    sns.set_context('poster')
    df = sns.load_dataset("iris")
    sns.pairplot(df, hue="species", height=2.5)
    mystyle.printout_plain('multiScatterplot.png')
def demo01():
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set()
    iris = sns.load_dataset('iris')
    print(iris.head())
    sns.pairplot(iris, hue='species', height=1.5)
    plt.show()
def feature_correlation(x, filepath=None, visualize=False):
    """
    :param x: feature matrix to plot pairwise
    """
    seaborn.pairplot(x)
    if visualize:
        plt.show()  # seaborn.plt was removed; use matplotlib's pyplot
    if filepath is not None:
        plt.savefig(filepath)
def main():
    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data',
                     header=None, sep=r'\s+')
    df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
                  'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
    print(df.head())

    # Select a subset of the features and plot the correlation between features
    cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
    sns.pairplot(df[cols], height=2.5)
    plt.title('Correlations between 5 features')
    plt.show()

    # Plot a heatmap of the same subset of features
    cm = np.corrcoef(df[cols].values.T)
    sns.set(font_scale=2.5)
    hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                     annot_kws={'size': 15}, yticklabels=cols, xticklabels=cols)
    plt.show()

    X = df[['RM']].values
    y = df['MEDV'].values
    sc_x = StandardScaler()
    sc_y = StandardScaler()
    X_std = sc_x.fit_transform(X)
    # StandardScaler expects a 2-D array, so reshape y before scaling
    y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()
    lr = LinearRegressionGD()
    lr.fit(X_std, y_std)

    plt.plot(range(1, lr.n_iter + 1), lr.cost_)
    plt.ylabel('SSE')
    plt.xlabel('Epoch')
    plt.show()

    lin_regplot(X_std, y_std, lr)
    plt.xlabel('Average number of rooms [RM] (standardized)')
    plt.ylabel('Price in $1000\'s [MEDV] (standardized)')
    plt.show()

    # Example prediction for a house with 5 rooms
    num_rooms_std = sc_x.transform([[5.0]])  # transform also expects 2-D input
    price_std = lr.predict(num_rooms_std)
    print("Price in $1000's: %.3f" %
          sc_y.inverse_transform(price_std.reshape(-1, 1))[0, 0])
def seaborn_plot(df, plot_type='pairplot', columns=False):
    sns.set()
    mpl.rc("figure", figsize=(16, 8.65))
    plotting_df = (df[columns] if columns else df)
    if plot_type == 'pairplot':
        sns.pairplot(plotting_df)
    elif plot_type == 'corr_plot':
        # sns.corrplot was removed from seaborn; a heatmap of the
        # correlation matrix gives the equivalent view
        sns.heatmap(plotting_df.corr(), annot=True)
    plt.show()  # sns.plt was removed from seaborn
    return
def pairplot(df, group="group"):
    sns.pairplot(data=df.drop('id', axis=1),
                 vars=['age', 'weight', 'heartrate', 'height'],
                 hue=group, diag_kind='kde', height=5,
                 diag_kws=dict(shade=True, linewidth=2),
                 plot_kws=dict(s=50))
    if group == "group":
        plt.savefig(os.path.join(FIG_PATH, 'pairplot.png'), dpi=100)
    else:
        plt.savefig(os.path.join(FIG_PATH, 'pairplot_%s.png' % group), dpi=100)
def main():
    train = load_data('../input/train.csv')
    print(train.head(5))
    # print(train.shape)
    # print(train.describe())
    # print(pd.isnull(train).any())
    # print(train.mean())
    train = train.fillna(train.mean())  # fillna returns a new frame; keep the result
    sns.set()
    # hue must name a column of the plotted frame, so include SalePrice in the subset
    sns.pairplot(train[["MSSubClass", "MSZoning", "LotFrontage", "SalePrice"]], hue="SalePrice")
    plt.show()  # sns.plt was removed from seaborn
def pair_plot(metrics):
    cols = [
        # 'meanRR', 'meanHR',
        'SDNN',
        'RMSSD',
        # 'peak_VLF', 'peak_LF', 'peak_HF',
        # 'power_VLF', 'power_LF', 'power_HF',
        'peak_HF',
        'power_LFHF',
        # 'pcpower_VLF', 'pcpower_LF', 'pcpower_HF',
        # 'nupower_LF', 'nupower_HF'
    ]
    sns.pairplot(metrics, hue='height', vars=cols)
def correlation_plot(self, logarithmic=True):
    # plot pairwise parameter correlations with a scatterplot matrix
    if not self.sampling_finished:
        raise Exception("Must run .sample() before any output results can be viewed.")
    if not logarithmic:
        sns.pairplot(self.posterior_samples)
    else:
        df = np.log10(self.posterior_samples.iloc[:, :-1]).dropna()
        g = sns.PairGrid(df, diag_sharey=False)
        g.map_lower(sns.kdeplot, cmap="Blues_d")
        g.map_upper(plt.scatter, alpha=.1)
        g.map_diag(sns.kdeplot, lw=3)
def distribution(self):
    # Plot the distributions of frequency, degree of aggregation, and degree of freedom
    import pandas as pd
    import matplotlib
    matplotlib.use('TkAgg')  # select the backend before importing pyplot
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set(style="white", color_codes=True)
    df = []
    for key, value in self.result.items():
        df.append([value['freq'], value['doa'], value['dof']])
    df = pd.DataFrame(data=df, columns=['frequency', 'doa', 'dof'])
    sns.pairplot(df)
    plt.show()
def plot_by_neighborhood(data, geojson_file):
    nbr = data.groupby('NEIGHBORHOOD')
    geonbr = gp.read_file(geojson_file).dropna()
    geonbr = geonbr.set_index('alias').join(nbr.mean())
    geonbr['# of Requests'] = nbr.size()
    geonbr['Income ($k)'] = geonbr['Median_HHI'] / 1000.
    cols = ['# of Requests', 'time/SLA ratio', 'Income ($k)']
    _, axes = axes_grid(len(cols))
    for col, ax in zip(cols, axes.flat):
        geonbr.plot(column=col, ax=ax)  # geopandas takes `ax`, not `axes`
        ax.set_title(col)
    seaborn.pairplot(geonbr[cols])
def main():
    df_train = pd.read_csv('../input/train.csv')
    df_test = pd.read_csv('../input/test.csv')
    sns.set()
    sns.pairplot(df_train[["bone_length", "rotting_flesh", "hair_length", "has_soul", "type"]], hue="type")
    # plt.show()
    df_train['hair_soul'] = df_train['hair_length'] * df_train['has_soul']
    df_train['hair_bone'] = df_train['hair_length'] * df_train['bone_length']
    df_train['hair_soul_bone'] = df_train['hair_length'] * df_train['has_soul'] * df_train['bone_length']
    df_test['hair_soul'] = df_test['hair_length'] * df_test['has_soul']
    df_test['hair_bone'] = df_test['hair_length'] * df_test['bone_length']
    df_test['hair_soul_bone'] = df_test['hair_length'] * df_test['has_soul'] * df_test['bone_length']
    test_id = df_test['id']
    df_train.drop(['id'], axis=1, inplace=True)
    df_test.drop(['id'], axis=1, inplace=True)
    df_train.drop(['color'], axis=1, inplace=True)
    df_test.drop(['color'], axis=1, inplace=True)
    X_train = df_train.drop('type', axis=1)
    y_train = df_train['type']
    X_train = pd.get_dummies(X_train)
    df_test_data = pd.get_dummies(df_test)
    # from sklearn.model_selection import train_test_split
    # x_train, x_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=0)
    # lr = LogisticRegression(penalty='l2', C=1000000)
    # lr.fit(X_train, y_train)
    # y_pred = lr.predict(df_test)
    # print(classification_report(y_pred, y_test))
    # test_results = logistic_regression(X_train, y_train, df_test_data)
    test_results = run_classifier(X_train, y_train, df_test_data, 'rf')
    save_result(test_id, test_results, 'results_logistic_regression.csv')
def histogram(self, x=None, y=None, l=None, t=None, **kwargs):
    """
    This is a short-cut for creating many possible histograms, at a specified
    beamline location l, or specified time t.

    - if x and y are not input, then it creates a full joint-scatterplot for
      each pair of variables (7 variables total: x, y, z, vx, vy, vz, t)
    - if x is input, it creates a 1d histogram with respect to that parameter
    - if x and y are input, creates a 2d histogram with respect to those parameters
    """
    table = self.to_dataframe(l=l, t=t, latex=True)
    if x is None and y is None:
        g = sns.pairplot(table, **kwargs)
        for ax in g.axes.flat:
            _ = plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)
        return
    if x is not None and y is None:
        x = self._reformat_label(x)
        sns.distplot(table[x], **kwargs)
        plt.xlabel(x)
        return
    if x is not None and y is not None:
        x = self._reformat_label(x)
        y = self._reformat_label(y)
        sns.jointplot(x=x, y=y, data=table, **kwargs)
        return
def plot_shapes(summary, save=False, condition='Condition', context='notebook'):
    """Plot and print area and volume of all steps and averaged over track"""
    columns_of_interest = ['Scan. Area/Step', 'Scan. Vol./Step', 'Mean Surface Area (µm2)',
                           'Mean Volume (µm3)', 'Mean Sphericity', condition]
    sns.set(style='white', context=context)
    sns.pairplot(summary[columns_of_interest], hue=condition, diag_kind='kde')
    plt.tight_layout()
    if save:
        conditions = [cond.replace('= ', '') for cond in summary[condition].unique()]
        plt.savefig('Shapes_' + '-'.join(conditions), dpi=300)
    else:
        plt.show()
def visualize_hist_pairplot(X, y, selected_feature1, selected_feature2, features, diag_kind):
    """
    Visualize the pairwise relationships (histograms and density functions) between classes and respective attributes

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    selected_feature1 -- First feature
    selected_feature2 -- Second feature
    diag_kind -- Type of plot in the diagonal (histogram or density function)
    """
    # create data
    joint_data = np.column_stack((X, y))
    column_names = features

    # create dataframe
    df = pd.DataFrame(data=joint_data, columns=column_names)

    # plot
    palette = sea.hls_palette()
    splot = sea.pairplot(df, hue="Y", palette={0: palette[2], 1: palette[0]},
                         vars=[selected_feature1, selected_feature2], diag_kind=diag_kind)
    splot.fig.suptitle('Pairwise relationship: ' + selected_feature1 + " vs " + selected_feature2)
    splot.set(xticklabels=[])
    # plt.subplots_adjust(right=0.94, top=0.94)

    # save fig
    output_dir = "img"
    save_fig(output_dir, '{}/{}_{}_hist_pairplot.png'.format(output_dir, selected_feature1, selected_feature2))
# See the distribution of the data
sns.distplot(data['charges'], ax=ax[0, 0])
sns.distplot(data['age'], ax=ax[0, 1])
sns.distplot(data['bmi'], ax=ax[1, 0])
sns.distplot(data['children'], ax=ax[1, 1])
sns.countplot(data['sex'], ax=ax[2, 0])
sns.countplot(data['smoker'], ax=ax[2, 1])
sns.countplot(data['region'], ax=ax[3, 0])

# Visualizing skewness
sns.pairplot(data)

# Let's look at smokers vs non-smokers on age vs charges:
sns.lmplot(x="age", y="charges", hue="smoker", data=data, palette='muted', height=7)
plt.show()  # plt.show takes no data argument

# Let's look at correlation:
corr = data.corr()
sns.heatmap(corr, cmap='Wistia', annot=True)
plt.show()

############################################01_04_ConvertCategoricalDataintoNumbers##############################################

# option 0: pandas factorizing: maps each category to a different integer = label encoder
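# A minimal sketch of the factorize option named just above, assuming the
# insurance-style `data` frame from this section; the `codes`/`uniques` names
# are ours, not the source's.
import pandas as pd

codes, uniques = pd.factorize(data['sex'])
data['sex_code'] = codes           # each category becomes an integer, like a label encoder
print(dict(enumerate(uniques)))    # e.g. {0: 'female', 1: 'male'}, depending on row order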
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# chapter05.py

#%%
import seaborn as sns
iris = sns.load_dataset('iris')
iris.head()

#%%
sns.set()
sns.pairplot(iris, hue='species', height=1.5)

#%%
X_iris = iris.drop('species', axis=1)
X_iris.shape

#%%
y_iris = iris['species']
y_iris.shape

#%%
# Supervised learning example: simple linear regression
import matplotlib.pyplot as plt
import numpy as np

plt.figure()
rng = np.random.RandomState(42)
x = 10 * rng.rand(50)
y = 2 * x - 1 + rng.randn(50)
plt.scatter(x, y)
@author: Vineeta
"""
import tkinter as tk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv("sales_data.csv", encoding='latin1')
data.shape
data.describe()
data.isnull().sum()
data = data.dropna(axis=1)
print(data)

sns.pairplot(data, x_vars=['QUANTITYORDERED', 'PRICEEACH', 'MSRP'],
             y_vars='SALES', height=4, aspect=1, kind='scatter')
plt.show()

from sklearn import linear_model

X = data[['QUANTITYORDERED', 'PRICEEACH', 'MSRP']]
Y = data['SALES']
regr = linear_model.LinearRegression()
regr.fit(X, Y)

QUANTITYORDERED = 56
PRICEEACH = 93.2
MSRP = 150
print('Predicted SalesPrice: \n', regr.predict([[QUANTITYORDERED, PRICEEACH, MSRP]]))

root = tk.Tk()
canvas1 = tk.Canvas(root, width=500, height=300)
canvas1.pack()
DATA = load(fn)
COST_MODEL, TRACE = DATA['model'], DATA['trace']

########################
# Model visualization
########################

# Plot cost model KDE (ln x scale)
FIG2, _ = plt.subplots(1, 1, figsize=(13, 6))
plt.title(f'KDE Workstation Cost {MODEL_PREFIX} Observation versus Model', fontsize=16)
kdeplot(log(Y), label='Observation')
kdeplot(log(Y_), label='Model')
plt.xlabel('Wrks Cost Ln()', fontsize=16)
plt.ylabel('Density', fontsize=16)
FIG2.savefig(f'Cost_model_KDE_{MODEL_PREFIX}_{F_BASENAME}.png')

# Plot TIERS visualization relationships
for tier in TIERS:
    cols = list(ATT) + [MEASURE, 'model_cost']
    ppp = log(PP[cols]).copy()
    ppp = concat([ppp, PP[tier]], axis=1)
    tvrf_name = f'Visualizing relationships-{MODEL_PREFIX}-{tier}-{F_BASENAME}.png'
    pairplot(ppp, hue=tier, height=3, kind='scatter').savefig(tvrf_name)

pairplot(ppp, height=3, kind='scatter', diag_kind='kde').savefig(
    f'Visualizing relationships-{MODEL_PREFIX}-{F_BASENAME}.png')

SUMMARY = df_summary(TRACE)
print(SUMMARY)
# In[3]:

# Descriptive statistics
s = data[['temperature', 'pressure', 'windspeed', 'electricity_consumption']]
s.describe()

# In[4]:

# Scatter plots
sns.set(style='whitegrid', context='notebook')
cols = ['temperature', 'pressure', 'windspeed', 'electricity_consumption']
sns.pairplot(data[cols], height=2.5)
plt.tight_layout()
plt.show()

# In[5]:

# Categorize by electricity consumption
def get_consumption_category(wt):
    if wt < 200:
        return "<200kWh"
    elif wt < 400:  # non-overlapping bounds, so boundary values are binned too
        return "200kWh~400kWh"
    elif wt < 600:
        return "400kWh~600kWh"
})
school_palette = {
    "Most cited analytical": "tab:red",
    "Anti-teorethical": "tab:blue",
    "Master": "tab:green",
    "Cavell": "tab:orange",
}

#%%
g = sns.pairplot(
    df,
    x_vars=["parentheses_ratio", "dot_parentheses_ratio", "r2"],
    y_vars=["author"],
    hue="school",
    height=5,
    aspect=0.6,
    diag_kind=None,
    markers=["H", "s", "o", "D"],
    plot_kws={"s": 50},
    palette=school_palette,
)
g._legend.remove()
g.add_legend(
    bbox_to_anchor=(0.37, 0.24),
    frameon=True,
    label_order=[
        "Cavell", "Master", "Anti-teorethical", "Most cited analytical"
    ],
)
g._legend.set_title(None)
g.axes.flatten()[0].set_ylabel("")
### Import and Global vars ###

# Commonly used modules
import numpy as np
import pandas as pd
# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

iris_data = pd.read_csv("./preprocessed_iris.csv")

# In[2]:

g = sns.pairplot(iris_data, hue="class", diag_kind="kde")
g.savefig('iris_output.png')

# In[3]:

google_data = pd.read_csv("./preprocessed_googleplaystore.csv",
                          usecols=["Category", "Rating", "Reviews", "Installs",
                                   "Price", "Last Updated"])
cat_list = list(google_data["Category"].unique())
replace_list = list(range(0, len(cat_list)))
mymap = dict(zip(cat_list, replace_list))
google_data = google_data.applymap(lambda s: mymap.get(s) if s in mymap else s)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load iris dataset (forward slashes avoid invalid escape sequences in the path)
iris = pd.read_csv("../datasets/iris.csv")
iris['variety'] = iris['variety'].astype('category')

# EDA
print(iris.info())
print(iris.groupby('variety').size())
print(iris.describe(include='all'))
sns.pairplot(iris, hue="variety")
sns.lmplot(x='petal.length', y='petal.width', data=iris, hue="variety", fit_reg=False)
base.shape
base.isnull().values.any()
base
base.info()
base.describe()

sns.countplot(x='Outcome', data=base);
base.hist(figsize=(20, 12));
sns.pairplot(base, hue='Outcome', vars=['Pregnancies', 'Glucose', 'BloodPressure',
                                        'SkinThickness', 'Insulin', 'BMI',
                                        'DiabetesPedigreeFunction', 'Age']);
sns.heatmap(base.corr(), annot=True);

X = base.iloc[:, 0:8].values
X
y = base.iloc[:, 8].values
y

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)
import numpy as np
import pandas as pd  # needed for pd.read_csv below
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

"""Loading the dataset"""

raw_data = pd.read_csv('drive/My Drive/data.csv', index_col=0)
raw_data.head()

"""**Exploratory Data Analysis**"""

# Checking for missing values
raw_data.isnull().sum()

"""Bivariate Analysis"""

sns.pairplot(raw_data, hue='is_goal')
plt.show()

"""Missing values treatment"""

raw_data.team_id.value_counts()

# drop columns which do not contribute to predictions
raw_data.drop('match_event_id', inplace=True, axis=1)
raw_data.drop('team_name', inplace=True, axis=1)
raw_data.drop('date_of_game', inplace=True, axis=1)
raw_data.drop('lat/lng', inplace=True, axis=1)
raw_data.drop('team_id', inplace=True, axis=1)
raw_data.shape

# Filling missing shot_id_numbers
# Handle the categorical data: the Origin column holds the categories 1, 2, 3,
# standing for the production region: USA, Europe, Japan.
# Pop this column off the frame
origin = dataset.pop('Origin')
# Write new indicator columns based on the origin column
dataset['USA'] = (origin == 1) * 1.0
dataset['Europe'] = (origin == 2) * 1.0
dataset['Japan'] = (origin == 3) * 1.0
dataset.tail()

# Split into training and test sets
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

# %% Summary statistics
sns.pairplot(train_dataset[["Cylinders", "Displacement", "Weight", "MPG"]], diag_kind="kde")

# %%
# Look at the statistics of the training inputs X
train_stats = train_dataset.describe()
train_stats.pop("MPG")
train_stats = train_stats.transpose()
train_stats

# Pop the MPG fuel-efficiency column off as the ground-truth label Y
train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')

# Standardize the data
def norm(x):
    return (x - train_stats['mean']) / train_stats['std']
ax.plot(x, y, 'o', markersize=1, label="data")
ax.plot(x, slm.fittedvalues, 'b--.', label="OLS")
ax.plot(x, iv_u, 'r--')
ax.plot(x, iv_l, 'r--')
ax.set_xlim([0, 50000])
ax.legend(loc='best')

# We do the same thing for these variables in another way, using seaborn.

# In[17]:

import seaborn as sns
sns.pairplot(
    csv_data,
    x_vars=['LotArea', 'GrLivArea', 'YearBuilt', 'FullBath', '2ndFlrSF'],
    y_vars='SalePrice',
    height=7,
    aspect=0.7,
    kind='reg')

# These are the same last few steps for the other variables. The difference between SST and SSE is the improvement in prediction from the regression model, compared to the mean model. Dividing that difference by SST gives R-squared. It is the proportional improvement in prediction from the regression model, compared to the mean model, and it indicates the goodness of fit of the model. R-squared has the useful property that its scale is intuitive: it ranges from zero to one, with zero indicating that the proposed model does not improve prediction over the mean model, and one indicating perfect prediction. Improvement in the regression model results in proportional increases in R-squared. One pitfall of R-squared is that it can only increase as predictors are added to the regression model. This increase is artificial when predictors are not actually improving the model's fit. To remedy this, a related statistic, Adjusted R-squared, incorporates the model's degrees of freedom. Adjusted R-squared will decrease as predictors are added if the increase in model fit does not make up for the loss of degrees of freedom. Likewise, it will increase as predictors are added if the increase in model fit is worthwhile. Adjusted R-squared should always be used with models with more than one predictor variable. It is interpreted as the proportion of total variance that is explained by the model. There are situations in which a high R-squared is not necessary or relevant. When the interest is in the relationship between variables, not in prediction, the R-squared is less important. An example is a study on how religiosity affects health outcomes. A good result is a reliable relationship between religiosity and health. No one would expect that religion explains a high percentage of the variation in health, as health is affected by many other factors. Even if the model accounts for other variables known to affect health, such as income and age, an R-squared in the range of 0.10 to 0.15 is reasonable.

# ## Multi-Variable Linear Regression
#
# In this part we try to fit the regression line using 6 variables. Our candidates are 'LotArea', 'GrLivArea', 'LotFrontage', '2ndFlrSF', 'YearBuilt', 'FullBath'. These are selected using their correlation with the 'SalePrice' variable. Variables with higher correlation would probably be more suitable to use for the regression problem. The following code will plot the regression line for each of these variables compared to 'SalePrice'.

# In[27]:

# using a statistical model for the variables
x_tot = csv_data[[
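# To make the R-squared discussion above concrete, here is a minimal sketch on
# synthetic data (not the notebook's csv_data): it computes R-squared and
# Adjusted R-squared by hand from SSE and SST, then checks them against statsmodels.
import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(0)
X = rng.rand(100, 2)
y = 3 * X[:, 0] + 0.5 * rng.randn(100)
res = sm.OLS(y, sm.add_constant(X)).fit()

sse = np.sum(res.resid ** 2)          # residual (error) sum of squares
sst = np.sum((y - y.mean()) ** 2)     # total sum of squares around the mean model
r2 = 1 - sse / sst
n, k = X.shape
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)  # penalizes extra predictors

print(r2, res.rsquared)          # the manual value matches statsmodels'
print(adj_r2, res.rsquared_adj)  # likewise for the adjusted statistic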
def draw_func(self):
    if len(self.all_data.index) == 0:
        self.show_message('Please import the sampling data')
    elif len(self.info_data.index) == 0:
        self.show_message('Please import the sampling info')
    elif not self.region_linked:
        self.show_message('Please click "link"')
    elif not self.figure_able:
        self.show_message('The data contains non-numeric values and cannot be plotted!')
    else:
        self.clear_func()
        self.cur_slice()
        # Make Chinese labels render correctly
        plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
        # plt.rcParams['font.sans-serif'] = ['SimHei']  # set a default font; SimHei is a bold face
        # Make the minus sign render correctly
        plt.rcParams['axes.unicode_minus'] = False
        plt.title(self.figure_type)
        if self.figure_type == "主成分分析":  # principal component analysis
            region_data = self.cur_data.iloc[:, 0].values.tolist()
            print(region_data)
            regions = list(set(region_data))
            print(regions)
            region_color = [int(regions.index(i) * 255 / len(regions)) for i in region_data]
            # region_color = [regions.index[i] for i in region_data]
            print(region_color)
            data = self.cur_data.iloc[:, 1:].values
            data = data - np.mean(data, axis=0)
            print("data", data.shape)
            cov_mat = np.cov(data, rowvar=0)
            print("cov:", cov_mat.shape)
            eig_vals, eig_vects = np.linalg.eig(np.mat(cov_mat))
            # sort the eigenvalues in descending order and keep the top-2 eigenvectors
            eig_val_indice = np.argsort(eig_vals)[::-1]
            top = 2
            n_eig_val_indice = eig_val_indice[:top]
            print("n_eig_val_indice", n_eig_val_indice)
            n_eig_vects = eig_vects[:, n_eig_val_indice]
            print("n_eig:", n_eig_vects.shape)
            # project the centered data onto the top-2 components
            low_data_mat = data * n_eig_vects
            print("low:", low_data_mat.shape)
            recon_mat = (low_data_mat * n_eig_vects.T) + np.mean(data, axis=0)
            print("rec:", recon_mat.shape)
            x = np.array(low_data_mat)[:, 0]
            y = np.array(low_data_mat)[:, 1]
            # z = np.array(low_data_mat)[:, 2]
            for region in regions:
                index = [i for i, data in enumerate(region_data) if data == region]
                plt.scatter(x[index], y[index])
            plt.legend(regions)
        elif self.figure_type == '平行坐标图':  # parallel coordinates
            parallel_coordinates(self.cur_data, self.region_method)
        elif self.figure_type == "Andrews图":  # Andrews curves
            colors = ['b', 'g', 'r', 'orange']
            andrews_curves(self.cur_data, self.region_method, color=colors)
        elif self.figure_type == 'Radiv图':  # RadViz
            radviz(self.cur_data, self.region_method)
        elif self.figure_type == '矩阵散点图':  # scatter-plot matrix
            print("Drawing the scatter-plot matrix")
            sns.pairplot(data=self.cur_data, hue=self.region_method)
            f = plt.gcf()
            self.ax = f
            self.canvas = FigureCanvas(f)
        elif self.figure_type == 'Chernoff脸谱图':  # Chernoff faces
            self.cur_data.to_excel('cur_data.xlsx')
            print("data out")
            # goto_r()
            os.system("python ./PyToR.py")
            face_info = pd.read_csv('face_info.csv')
            # f_str = face_info.to_string()
            font = {'weight': 'normal', 'size': 11}
            plt.text(500, 0, "Face-plot item    Data column", fontdict=font)
            for index, row in face_info.iterrows():
                f_str = row[0] + " : "
                plt.text(500, 20 + 20 * index, f_str, fontdict=font)
                f_str = row[1]
                plt.text(650, 30 + 20 * index, f_str, fontdict=font)
            plt.imshow(Image.open('face.png'))
            plt.gca().add_patch(plt.Rectangle(xy=(500, 20), width=100, height=300,
                                              edgecolor=[1, 1, 1], fill=False, linewidth=2))
            # print("The file is saved as: face.jpg")
            # info = pd.read_csv('face_info.csv', encoding='GBK')
            # print("effect of variables:\n{}".format(info))
        self.table_view.setVisible(False)
        self.canvas.setVisible(True)
        self.figure_layout.removeWidget(self.table_view)
        self.figure_layout.addWidget(self.canvas)
        self.canvas.draw()
        self.figure_state = 2
g = sns.catplot(x="Rated 4.4 or more", y="Reviews", data=df)
g.savefig('rated4.4ormore-reviews.png')

df = df.drop('Rated 4.4 or more', axis=1)
corr = df.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed from NumPy
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
g = sns.heatmap(corr, cmap=cmap, mask=mask, annot=True, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5})
g.figure.savefig('heatmap.png')

g = sns.pairplot(df)
g.savefig('pairwise.png')
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    sns.heatmap(data=corr, mask=mask, annot=True, vmin=-1, vmax=1)

# In[78]:

plot_heatmap(df)

# In[79]:

cols = ['temp', 'atemp', 'windspeed', 'humidity']
pp = sns.pairplot(df[cols], diag_kws=dict(shade=True), diag_kind="kde", kind="reg")
fig = pp.fig
fig.subplots_adjust(top=0.93, wspace=0.3)
fig.suptitle('Correlation of the numeric variables', fontsize=14, fontweight='bold')

# In[80]:

sns.boxplot(data=df[['temp', 'atemp', 'humidity', 'windspeed', 'count']], orient='h')
fig = plt.gcf()
fig.set_size_inches(12, 6)
fig.suptitle('Outlier analysis', fontsize=14, fontweight='bold')
from sklearn.decomposition import FastICA
import pandas as pd
import seaborn as sns

def print_ica_plot(comp, scaled_data):
    ica = FastICA(n_components=comp)
    ica_fit = ica.fit_transform(scaled_data)
    ica_df = pd.DataFrame(ica_fit)
    sns.pairplot(ica_df)
def visualization(self):
    """
    Request parameters for this endpoint:
    "tableName": "advertising",  # str, database table name
    "X": ["TV", "radio", "newspaper"],  # list; the variable names when the table orientation is horizontal, the categorical field when it is vertical
    "Y": ["sales"],  # list, dependent variable; used when the table orientation is vertical
    "show_options": ["y_count", "pairs", "corr", "y_corr"],  # display options
    "x_count": [],  # list, independent variables to show frequency histograms for
    "box": [],  # list, independent variables to show box plots for
    :return:
    """
    try:
        res = []
        self.table_data = self.table_data.astype("float")
        data = self.table_data.describe()
        res.append(
            transform_table_data_to_html({
                "data": data.values.tolist(),
                "title": "Descriptive statistics",
                "col": data.columns.tolist(),
                "row": data.index.tolist()
            }))
        if self.config.get("x_count") and self.config.get("x_count")[0]:
            for x in self.config["x_count"]:
                sns.distplot(self.table_data[x], kde=False)
                # y-axis label
                plt.ylabel("frequency")
                # figure title
                # plt.title("{} - frequency distribution histogram".format(x))
                res.append({
                    "title": "{} - frequency distribution".format(x),
                    "base64": "{}".format(self.plot_and_output_base64_png(plt))
                })
        if "y_count" in self.config["show_options"]:
            sns.distplot(self.table_data[self.config["Y"][0]], kde=False)
            # x-axis label
            plt.xlabel("section")
            # y-axis label
            plt.ylabel("frequency")
            # figure title
            # plt.title("y frequency distribution histogram")
            res.append({
                "title": "{} - frequency distribution".format(self.config["Y"][0]),
                "base64": "{}".format(self.plot_and_output_base64_png(plt))
            })
        if self.config.get("box") and self.config.get("box")[0]:
            for x in self.config["box"]:
                sns.boxplot(self.table_data[x], palette="Set2", orient="v")
                # figure title
                # plt.title("{} - Box distribution to check outliers".format(x))
                res.append({
                    "title": "{} - box plot".format(x),
                    "base64": "{}".format(self.plot_and_output_base64_png(plt))
                })
        if "pairs" in self.config["show_options"]:
            sns.pairplot(self.table_data)
            # plt.title("Variable relation in pairs")
            res.append({
                "title": "Pairwise variable relationships",
                "base64": "{}".format(self.plot_and_output_base64_png(plt))
            })
        if "corr" in self.config["show_options"]:
            corr = self.table_data.corr()
            sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns,
                        linewidths=0.2, cmap="YlGnBu", annot=True)
            # plt.title("Correlation between variables")
            res.append({
                "title": "Correlation coefficients",
                "base64": "{}".format(self.plot_and_output_base64_png(plt))
            })
        if "y_corr" in self.config["show_options"]:
            self.table_data.corr()[self.config["Y"][0]].sort_values(
                ascending=False).plot(kind='bar')
            # plt.title("Correlations between y and x")
            res.append({
                "title": "Correlation of the dependent variable with each independent variable",
                "base64": "{}".format(self.plot_and_output_base64_png(plt))
            })
        response_data = {"res": res, "code": "200", "msg": "ok!"}
        return response_data
    except Exception as e:
        return {"data": "", "code": "500", "msg": "{}".format(e.args)}
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

dataset = pd.read_csv('studentperformance.csv')
dataset.columns = [
    'gender', 'race', 'ped', 'lunch', 'test', 'math', 'reading', 'writing'
]
dataset.info()
dataset.describe()

pd.plotting.scatter_matrix(dataset)
sns.pairplot(dataset)
# newer seaborn requires keyword arguments for x and y
sns.barplot(x=dataset['gender'].value_counts().index,
            y=dataset['gender'].value_counts(),
            hue=['female', 'male'])
sns.barplot(x=dataset['race'], y=dataset['math'], hue=dataset['gender'])
sns.barplot(x=dataset['race'], y=dataset['reading'], hue=dataset['gender'])
sns.barplot(x=dataset['race'], y=dataset['writing'], hue=dataset['gender'])
sns.barplot(x=dataset['ped'], y=dataset['math'], hue=dataset['gender'])
sns.barplot(x=dataset['ped'], y=dataset['reading'], hue=dataset['gender'])
sns.barplot(x=dataset['ped'], y=dataset['writing'], hue=dataset['gender'])
target_column = "Species"

# %% [markdown]
# Let's check the dataset in more detail.

# %%
penguins.head()

# %% [markdown]
# Since we have few samples, we can check a scatter plot to observe the
# sample distribution.

# %%
import seaborn as sns

pairplot_figure = sns.pairplot(penguins, hue="Species")
pairplot_figure.fig.set_size_inches(9, 6.5)

# %% [markdown]
# First let's check the feature distributions by looking at the diagonal plots
# of the pairplot. We can deduce the following intuitions:
#
# * The Adelie species can be differentiated from the Gentoo and Chinstrap
#   species depending on the culmen length;
# * The Gentoo species can be differentiated from the Adelie and Chinstrap
#   species depending on the culmen depth.
#
# (A quick numeric check of these intuitions follows after this cell.)
#
# ## Regression dataset
#
# In a regression setting, the target is a continuous variable instead of
# categories. Here, we use two features of the dataset to make such a problem:
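# %% [markdown]
# A quick numeric check of the pairplot intuitions above. This is a sketch:
# it assumes the culmen columns follow the "Culmen Length (mm)" /
# "Culmen Depth (mm)" naming of the penguins CSV loaded in this notebook.

# %%
# per-species means should separate Adelie on length and Gentoo on depth
penguins.groupby("Species")[["Culmen Length (mm)", "Culmen Depth (mm)"]].mean()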
def main():
    photoData = PhotoData.Photo()
    lists = []
    header = []
    cols = []

    # Set initial data
    photoDirectories = []
    ignoreCameraModel = []
    fillEmptyLensModel = {
        "DSC-RX100M3": "No Lens Data",
    }  # sample
    PlotBarRotate = {}

    # Set plot initial data
    plotFixSizeX = 12
    plotFixSizeY = 8
    plotGrid = False
    plotSubPlots = False
    plotFontSize = 10
    plotRotate = 0
    plotBar = False
    plotScatter = False
    plotHexbin = False
    plotPie = False
    plotSeaborn = False  # initialize so the flag exists even without an ini entry

    for line in open('PhotoDataAnalysis.ini', 'r'):
        line = line.strip()
        if line == "":
            pass
        elif line[0:1] == "#":
            pass
        else:
            item, param = line.split("=", 1)
            if item == "PhotoDirectory":
                photoDirectories.append(param)
            if item == "IgnoreCameraModel":
                ignoreCameraModel.append(param)
            if item == "FillEmptyLensModel":
                param1, param2 = param.split(":", 1)
                fillEmptyLensModel[param1] = param2.strip()
            if item == "PlotFigSizeX":
                plotFixSizeX = int(param)
            if item == "PlotFigSizeY":
                plotFixSizeY = int(param)
            if item == "PlotGrid":
                plotGrid = (param.lower() == "true")
            if item == "PlotSubPlots":
                plotSubPlots = (param.lower() == "true")
            if item == "PlotsFontSize":
                plotFontSize = param  # default value
            if item == "PlotBarRotate":
                param1, param2 = param.split(":")
                PlotBarRotate[param1] = int(param2)
                if param1 == "Default":
                    plotRotate = param2
            if item == "PlotBar":
                plotBar = (param == "True")
            if item == "PlotScatter":
                plotScatter = (param == "True")  # was "PlotScatter = False", a capitalization typo
            if item == "PlotHexbin":
                plotHexbin = (param == "True")
            if item == "PlotPie":
                plotPie = (param == "True")
            if item == "PlotSeaborn":
                plotSeaborn = (param == "True")

    print("#")
    print("#")
    print("#")
    print("# Load and analyze photo data")
    print("#")
    print("#")
    print("#")
    print("> Load photo data")
    for photoDirectory in photoDirectories:
        lists, header = GetExifData(lists=lists, ignoreCameraModel=ignoreCameraModel,
                                    fileFullPath=photoDirectory)
        if header != []:
            cols = header

    now = datetime.datetime.now()
    now_dt = (str(now.year).zfill(2) + str(now.month).zfill(2) + str(now.day).zfill(2) +
              " " + str(now.hour).zfill(2) + str(now.minute).zfill(2) + str(now.second).zfill(2))

    # Directory for saving the data
    if os.path.exists("./Data") == False:
        os.mkdir("./Data")
    dataDir = "./Data/" + now_dt
    os.mkdir(dataDir)

    # For checking when this error occurs: AssertionError: 45 columns passed, passed data had 43 columns
    # for x in lists:
    #     print(str(len(x)) + ":" + x[0])

    # - - -
    # Set up pandas
    # - - -
    df = pd.DataFrame(data=lists, columns=cols)
    df = df.applymap(illegal_char_remover)
    df["Count"] = df.apply(lambda x: 1, axis=1)
    # Fill in Make when it is blank for iPhones
    df["Make"] = df.apply(lambda x: "Apple" if x.Model[0:6] == "iPhone" else x.Make, axis=1)
    # Fill in LensModel when it is empty
    df["LensModel_org"] = df.apply(lambda x: x.LensModel, axis=1)  # backup
    df["LensModel"] = df.apply(lambda x: "Unknown" if x.LensModel == "" else x.LensModel, axis=1)
    df["LensModel"] = df.apply(lambda x: fillEmptyLensModel[x.Model]
                               if x.Model in fillEmptyLensModel and x.LensModel == "Unknown"
                               else x.LensModel, axis=1)
    # Fill blank numeric EXIF values with 0
    df["ISOSpeedRatings"] = df.apply(lambda x: 0 if x.ISOSpeedRatings == "" else x.ISOSpeedRatings, axis=1)
    df["FNumber_cust"] = df.apply(lambda x: 0 if x.FNumber_cust == "" else x.FNumber_cust, axis=1)
    df["ExposureTime_calc"] = df.apply(lambda x: 0 if x.ExposureTime_calc == "" else x.ExposureTime_calc, axis=1)
    df["ExposureTime_cust"] = df.apply(lambda x: 0 if x.ExposureTime_cust == "" else x.ExposureTime_cust, axis=1)
    df["FocalLengthIn35mmFilm"] = df.apply(lambda x: 0 if x.FocalLengthIn35mmFilm == "" else x.FocalLengthIn35mmFilm, axis=1)
    df["LightSource_cust"] = df.apply(lambda x: "Auto" if x.LightSource_cust == 0 else x.LightSource_cust, axis=1)
    # Add year, month, weekday and hour columns
    df["Year"] = df.apply(lambda x: GetYear(x.DateTimeOriginal), axis=1)
    df["Month"] = df.apply(lambda x: GetMonth(x.DateTimeOriginal), axis=1)
    df["Year_Month"] = df.apply(lambda x: GetYearMonth(x.DateTimeOriginal), axis=1)
    df["Hour"] = df.apply(lambda x: GetHour(x.DateTimeOriginal), axis=1)
    df["Week"] = df.apply(lambda x: GetWeek(x.DateTimeOriginal), axis=1)

    # List camera models
    print("[Camera Model List]")
    pvt_cm = pd.pivot_table(df, values="Count", index=["Make", "Model"], aggfunc=lambda x: len(x))
    print(pvt_cm)

    # - - -
    # Create charts
    # - - -
    print("[Create Charts]")
    # Kinds of bar chart
    plots_bar = [
        ["FNumber_cust", ["Make", "Model"]],
        ["FNumber_cust", ["Make", "Model", "LensModel"]],
        ["FocalLengthIn35mmFilm", ["Make", "Model"]],
        ["FocalLengthIn35mmFilm", ["Make", "Model", "LensModel"]],
        ["FocalLength_cust", ["Make", "Model"]],
        ["FocalLength_cust", ["Make", "Model", "LensModel"]],
        ["ShutterSpeed_calc", ["Make", "Model"]],
        ["ShutterSpeed_calc", ["Make", "Model", "LensModel"]],
        ["ISOSpeedRatings", ["Make", "Model"]],
        ["ISOSpeedRatings", ["Make", "Model", "LensModel"]],
        ["Orientation_cust", ["Make", "Model"]],
        ["Orientation_cust", ["Make", "Model", "LensModel"]],
        ["LightSource_cust", ["Make", "Model"]],
        ["LightSource_cust", ["Make", "Model", "LensModel"]],
        ["MeteringMode_cust", ["Make", "Model"]],
        ["MeteringMode_cust", ["Make", "Model", "LensModel"]],
        ["ApertureValue_cust", ["Make", "Model"]],
        ["ApertureValue_cust", ["Make", "Model", "LensModel"]],
        ["BrightnessValue_cust", ["Make", "Model"]],
        ["BrightnessValue_cust", ["Make", "Model", "LensModel"]],
        ["ExposureBiasValue_cust", ["Make", "Model"]],
        ["ExposureBiasValue_cust", ["Make", "Model", "LensModel"]],
        ["MaxApertureValue_cust", ["Make", "Model"]],
        ["MaxApertureValue_cust", ["Make", "Model", "LensModel"]],
        ["Sharpness_cust", ["Make", "Model"]],
        ["Sharpness_cust", ["Make", "Model", "LensModel"]],
        ["SceneCaptureType_cust", ["Make", "Model"]],
        ["SceneCaptureType_cust", ["Make", "Model", "LensModel"]],
        ["Make", ""],
        ["Model", ""],
        ["LensModel", ""],
        ["Year", ["Make", "Model"]],
        ["Year", ["Make", "Model", "LensModel"]],
        ["Year_Month", ["Make", "Model"]],
        ["Year_Month", ["Make", "Model", "LensModel"]],
        ["Month", ["Make", "Model"]],
        ["Month", ["Make", "Model", "LensModel"]],
        ["Hour", ["Make", "Model"]],
        ["Hour", ["Make", "Model", "LensModel"]],
        ["Week", ["Make", "Model"]],
        ["Week", ["Make", "Model", "LensModel"]],
        ["Hour", "ISOSpeedRatings"],
        ["Hour", "FNumber_cust"],
        ["Hour", "ShutterSpeed_cust"],
        ["FNumber_cust", "ISOSpeedRatings"],
        # ["ExposureTime_calc", ""],
        # [["FNumber_cust", "ISOSpeedRatings"], ""],  # this pattern raises an error
    ]

    # Kinds of scatter chart
    plots_scatter = [
        ["ExposureTime_calc", "FNumber_cust"],
        ["Hour", "FNumber_cust"],
        ["Hour", "ISOSpeedRatings"],
        ["ISOSpeedRatings", "FNumber_cust"],
        ["FocalLengthIn35mmFilm", "FNumber_cust"],
        ["FocalLength_cust", "FNumber_cust"],
        ["ApertureValue_cust", "FNumber_cust"],
        ["ShutterSpeed_calc", "FNumber_cust"],
        ["ShutterSpeed_calc", "ISOSpeedRatings"],
        ["ShutterSpeed_calc", "ExposureTime_calc"],
    ]

    # Kinds of pie chart
    plots_pie = [
        "Make",
        "Model",
        "LensModel",
        "FocalLengthIn35mmFilm",
        "FocalLength_cust",
        "FNumber_cust",
        "ISOSpeedRatings",
        "Year",
        "Month",
        "Week",
    ]

    # Write the pivot tables to an Excel file
    saveExcelFile = dataDir + "/Photo Data " + now_dt + ".xlsx"
    writer = pd.ExcelWriter(saveExcelFile)
    df.to_excel(writer, sheet_name=now_dt)

    lgd = False  # default so the scatter section works even when no bar charts are drawn

    # - - - - - - - - - -
    # Create bar charts
    # - - - - - - - - - -
    if plotBar == True:
        for idx, clm in plots_bar:
            if clm != "":
                clm2 = ""
                ds = pd.pivot_table(df, values="Count", index=idx, columns=clm,
                                    aggfunc=lambda x: len(x))
                if isinstance(clm, list):
                    n = 0
                    for val in clm:
                        n += 1
                        if n == 1:
                            clm2 = val
                        else:
                            clm2 = clm2 + " & " + val
                else:
                    clm2 = clm
                idx = ModifyName(idx)  # rename after building the pivot
                fn = idx + " by " + ModifyName(clm2)
                pTitle = "x: " + ModifyName(idx) + " | y: " + ModifyName(clm2)
                lgd = True
            else:
                ds = pd.pivot_table(df, values="Count", index=idx, aggfunc=lambda x: len(x))
                idx = ModifyName(idx)  # rename after building the pivot
                fn = idx
                pTitle = idx
                lgd = False
            # Shorten long names
            fn = ReduceName(fn)
            # Write to the Excel file
            ds.to_excel(writer, sheet_name=fn.replace("&", "_"))
            # Draw the plot
            if idx in PlotBarRotate:
                rotate = PlotBarRotate[idx]  # rotation configured individually
            else:
                rotate = plotRotate
            fontSize = 8
            ds.columns.name = ""
            ds.index.name = ""
            ds.plot(kind="bar", title=pTitle, grid=plotGrid, legend=lgd,
                    subplots=plotSubPlots, fontsize=plotFontSize, rot=rotate,
                    figsize=(plotFixSizeX, plotFixSizeY), stacked=True)
            saveFile = dataDir + "/Pd_" + fn + ".png"
            print("> Plot:" + saveFile)
            plt.savefig(saveFile)
            plt.close()
    else:
        pass

    # - - - - - - - - - -
    # Create scatter/hexbin charts
    # - - - - - - - - - -
    for val_x, val_y in plots_scatter:
        pTx = ModifyName(val_x)
        pTy = ModifyName(val_y)
        pTitle = str(pTx) + " vs " + str(pTy)
        # Shorten long names
        fn = ReduceName(pTitle)
        if plotScatter == True:
            # Scatter chart
            df.plot(kind='scatter', x=val_x, y=val_y, linewidth="2", c="blue",
                    edgecolors="blue", title=pTitle, grid=plotGrid, legend=lgd,
                    subplots=plotSubPlots, fontsize=plotFontSize,
                    # rot=plotRotate,
                    figsize=(plotFixSizeX, plotFixSizeY), stacked=True)
            plt.xlabel(pTx)
            plt.ylabel(pTy)
            saveFile = dataDir + "/Ps_" + fn + ".png"
            print("> Plot:" + saveFile)
            plt.savefig(saveFile)
            plt.close()
        else:
            pass
        if plotHexbin == True:
            # Hexbin chart
            df.plot(kind='hexbin', x=val_x, y=val_y, gridsize=30, marginals=False,
                    cmap=cm.PuBu, title=pTitle, grid=plotGrid, legend=lgd,
                    subplots=plotSubPlots, fontsize=plotFontSize,
                    # rot=plotRotate,
                    figsize=(plotFixSizeX, plotFixSizeY), stacked=True)
            plt.xlabel(pTx)
            plt.ylabel(pTy)
            saveFile = dataDir + "/Ph_" + fn + ".png"
            print("> Plot:" + saveFile)
            plt.savefig(saveFile)
            plt.close()
        else:
            pass

    # - - - - - - - - - -
    # Create pie charts
    # - - - - - - - - - -
    if plotPie == True:
        for idx in plots_pie:
            try:
                pTitle = ModifyName(idx)
                ds = pd.pivot_table(df, values="Count", index=idx, aggfunc=lambda x: len(x))
                ds.plot(kind="pie", y="Count", subplots=True, title=pTitle, autopct='%.1f',
                        figsize=(plotFixSizeX, plotFixSizeY), counterclock=False,
                        startangle=90, pctdistance=0.8)
                plt.ylabel("")
                fn = ModifyName(idx)
                # Shorten long names
                fn = ReduceName(fn)
                saveFile = dataDir + "/Pp_" + str(fn) + ".png"
                print("> Plot:" + saveFile)
                plt.axis('equal')
                plt.savefig(saveFile)
                plt.close()
            except AssertionError as err:
                print("*EXCEPTION:", err)
    else:
        pass

    # - - - - - - - - - -
    # Seaborn charts
    # - - - - - - - - - -
    if plotSeaborn == True:
        # Seaborn pairplot #1
        df_select = df.loc[:, ["FocalLength_cust", "FNumber_cust", "ShutterSpeed_calc",
                               "ISOSpeedRatings", "ApertureValue_cust",
                               "ExposureBiasValue_cust", "LensModel"]]
        sb = sns.pairplot(df_select, hue="LensModel")
        saveFile = dataDir + "/Sp_Pairplot1.png"
        plt.savefig(saveFile)
        plt.close()

        # Seaborn pairplot #2
        df_select = df.loc[:, ["Hour", "FocalLength_cust", "FNumber_cust",
                               "ShutterSpeed_calc", "ISOSpeedRatings", "LensModel"]]
        sb = sns.pairplot(df_select, hue="LensModel")
        saveFile = dataDir + "/Sp_Pairplot2.png"
        plt.savefig(saveFile)
        plt.close()

        # Seaborn jointplot (using the scatter chart pairs)
        for val_x, val_y in plots_scatter:
            pTx = ModifyName(val_x)
            pTy = ModifyName(val_y)
            pTitle = str(pTx) + " vs " + str(pTy)
            sb_kind = "hex"  # reg, kde, hex
            sb = sns.jointplot(x=val_x, y=val_y, data=df, kind=sb_kind)
            # Shorten long names
            fn = ReduceName(pTitle)
            saveFile = dataDir + "/Sj_" + str(fn) + ".png"
            print("> Plot:" + saveFile)
            plt.savefig(saveFile)
            plt.close()

        # Seaborn heatmap
        # sb = sns.heatmap(df.corr())
    else:
        pass

    # Finally, save the workbook
    writer.save()
    print("> Saved EXCEL file: " + saveExcelFile)
df['Current_Year'] = 2020
df['Age_of_Car'] = df['Current_Year'] - df['Year']
df.head()

df = df.drop(['Car_Name', 'Year', 'Current_Year'], axis=1)
df.head()

df = pd.get_dummies(df, drop_first=True)
df.head()

sns.pairplot(df)

corr = df.corr()
top_features = corr.index
plt.figure(figsize=(12, 6))
sns.heatmap(corr, annot=True, cmap='RdYlGn')

x = df.iloc[:, 1:]
y = df.iloc[:, 0]
x.head()
y.head()

# Feature Importance
color='lightgreen')
plt.show()

# In[24]:

# Look at how the attributes vary with one another
x = data_file['Rating'].dropna()
y = data_file['Size'].dropna()
z = data_file['Installs'][data_file.Installs != 0].dropna()
p = data_file['Reviews'][data_file.Reviews != 0].dropna()
t = data_file['Type'].dropna()
price = data_file['Price']
p = sns.pairplot(pd.DataFrame(
    list(zip(x, y, np.log(z), np.log10(p), t, price)),
    columns=['Rating', 'Size', 'Installs', 'Reviews', 'Type', 'Price']),
    hue='Type', palette="Set2")

# In[25]:

# Look at how the values for the Rating attribute are distributed
data_file.hist(column='Rating')
plt.ylim(0, 10841)
plt.title("Distribution of Rating")
plt.xlabel("Rating value")
plt.ylabel("Number of applications")

# In[26]:

# Encoding the App attribute
def pairplots(trainA):
    sns.set(style="ticks", color_codes=True)
    sns.pairplot(trainA, diag_kind='kde')
    plt.show()
for c in columns:
    if 'mep' in c:
        idx = int(c[-1]) - 1
        data = np.log(mat['AmpsMclean'][()][idx])
    # elif 'amplitude' in c:
    #     data = np.log(mat[c][()][idx])
    else:
        data = mat[c][()][idx]
    vector.append(data)
vector = np.vstack(vector)
df = pd.DataFrame(vector.T, columns=columns)
sns.pairplot(df, diag_kws=diag_kws, plot_kws=plot_kws)
mat.close()

##############################################################################

task = 'phastimate'
threshold_key = 'phases32'
full_dataset = list()
for i in range(9):
    sub = "sub-%03d" % (i + 1)
    filename = os.path.join(
        path, sub,
        sub + "_space-sensor_window-500_atlas-subject_band-mu_%s.mat" % (task))
    mat = h5py.File(filename, 'r')
    vector = []
house = house.drop(['id', 'date'], axis=1)

# **Pairplot Visualisation**
#
# Let's create some Seaborn pairplots for the features ('sqft_lot', 'sqft_above', 'price', 'sqft_living', 'bedrooms') to get a feel for how the various features are distributed vis-a-vis the price as well as the number of bedrooms.

# In[ ]:

# sns.pairplot(house[['sqft_lot','sqft_above','price','sqft_living','bedrooms']], hue='bedrooms', palette='afmhot', size=1.4)

# In[ ]:

with sns.plotting_context("notebook", font_scale=2.5):
    g = sns.pairplot(house[['sqft_lot', 'sqft_above', 'price', 'sqft_living', 'bedrooms']],
                     hue='bedrooms', palette='tab20', height=6)
    g.set(xticklabels=[])

# From the pairplots, we seem to get the classical linear distribution of the data points, for example with price against sqft_living. This bodes well, since later in the analysis we will implement some linear models, which we will use in our feature ranking. Let's look at the correlation heatmap:

# In[ ]:

str_list = []  # empty list to contain columns with strings (words)
for colname, colvalue in house.items():  # iteritems was removed from pandas
    if type(colvalue[1]) == str:
        str_list.append(colname)
# Get to the numeric columns by inversion
num_list = house.columns.difference(str_list)
# Create Dataframe containing only numerical features
from sklearn.model_selection import train_test_split, cross_val_score

# Remove scientific notation
np.set_printoptions(suppress=True)

# Load the data
data = pd.read_csv("Dataset.csv")
x_title = ['Tm', ' Pr', 'Th', 'Sv']
y_title = 'Idx'
x_original = data[x_title]
y_original = data.Idx
x_train, x_test, y_train, y_test = train_test_split(x_original, y_original,
                                                    test_size=0.5, random_state=1)

# Plot the data scatter for linear regression =======================================
seaborn.pairplot(data, x_vars=x_title, y_vars=y_title, height=7, aspect=1)
matplot.show()

# Plot the original graphs and the linear regression fits
x_attributes = ["Tm", " Pr", "Th", "Sv"]
x_labels = ['Temperature', 'Pressure', 'Thermal Conductivity', 'Sound Velocity']
for count, x_attr in enumerate(x_attributes):
    matplot.scatter(x_original[x_attr], y_original)
    liLSM = LinearRegression()
    # pandas Series no longer has .reshape; go through .values
    liLSM.fit(x_train[x_attr].values.reshape(-1, 1), y_train)
    y_predict = liLSM.predict(x_test[x_attr].values.reshape(-1, 1))
    matplot.plot(x_test[x_attr].values.reshape(-1, 1), y_predict, 'r')
    matplot.legend(['Predicted line', 'Observed data'])
    matplot.xlabel(x_labels[count])
    matplot.ylabel('Chem Index')
# %%
# Now let's compare Google to itself
sns.jointplot(x='GOOG', y='GOOG', data=tech_returns, kind='scatter', color='seagreen')

# %%
# That's a perfect linear relationship, and that makes sense, since we are comparing Google to Google.

# %%
# Now let's check if there are relationships between different tech stocks
sns.jointplot(x='GOOG', y='MSFT', data=tech_returns, kind='scatter', color='seagreen')

# %%
# Now let's do some plots that will make it easy to compare the tech stocks on our list
tech_returns.head()

# %%
sns.pairplot(tech_returns.dropna())

# %%
sns.pairplot(tech_returns.dropna(), kind="reg")

# %%
sns.pairplot(tech_returns.dropna(), kind="reg", diag_kind='kde')

# %%
# Just so we can have an idea of how to interpret these graphs:
from IPython.display import SVG
SVG(url='http://upload.wikimedia.org/wikipedia/commons/d/d4/Correlation_examples2.svg')

# %%
# The above visualizations show an interesting correlation between Google and Amazon daily returns.
# We can dig a little deeper and use a PairGrid to get a more detailed and controlled plot of those two.

# %%
returns_fig = sns.PairGrid(tech_returns.dropna())
returns_fig.map_upper(plt.scatter, color='purple')
import matplotlib.pyplot as plt  # https://matplotlib.org/

df1.groupby('gender').size()
df1.groupby('gender').size().plot(kind='bar')
plt.hist(df1['marks'])

# https://seaborn.pydata.org/index.html
import seaborn as sns
# sns.set(style="ticks", color_codes=True)
iris = sns.load_dataset("iris")
iris.head()
iris.tail()
df1.groupby('gender').size()
iris.groupby('species').size().plot(kind='bar')
sns.pairplot(iris)

#%%
# Load inbuilt datasets
import statsmodels.api as sm
# https://vincentarelbundock.github.io/Rdatasets/datasets.html
mtcars = sm.datasets.get_rdataset(dataname='mtcars', package='datasets')
mtcars.data.head()
mtcars.data.tail()
mtcars.data.columns

#%%
# Load from Excel/CSV and export to
data = mtcars.data
data.head(6)
type(data)
print('=' * 45)

# crosstab for the categorical variables
for c in category:
    table = pd.crosstab(data[c], data['y'])
    table.plot(kind='bar')

# correlation for the numeric variables
corelation = data.corr()
fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(corelation, annot=True)

### multivariate analysis
sns.pairplot(data, hue='y', palette='coolwarm')

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

def categorical_variable(dataframe):
    variable_name = [i for i in dataframe.columns if dataframe.dtypes[i] == 'object']
    for x in variable_name:
        dataframe[x] = le.fit_transform(dataframe[x])
    return dataframe

categorical_variable(transformed_data)
transformed_data.columns

# feature selection
from sklearn.model_selection import train_test_split
# the opening of the first scatter call is reconstructed from the two calls below
plt.scatter(df_company[df_company['kmean'] == 0][2],
            df_company[df_company['kmean'] == 0][4],
            s=100, c='red', label='Cluster 1')
plt.scatter(df_company[df_company['kmean'] == 1][2],
            df_company[df_company['kmean'] == 1][4],
            s=100, c='blue', label='Cluster 2')
plt.scatter(df_company[df_company['kmean'] == 2][2],
            df_company[df_company['kmean'] == 2][4],
            s=100, c='green', label='Cluster 3')

sns.pairplot(df_company, hue='kmean')

# ############################ Hierarchical Clustering
# l1 = [df['EduDegree'], df['HasChild'], df['GeoLivArea']]
# df['hc_split'] = pd.concat(, axis=1)
df['hc_split'] = df['EduDegree'].map(str) + df['HasChild'].map(str) + df['GeoLivArea'].map(str)
sns.countplot('hc_split', data=df)
df.drop(['EduDegree', 'HasChild', 'GeoLivArea'], inplace=True, axis=1)
df.drop(['CustId'], inplace=True, axis=1)
df_hc = df.drop(['hc_split'], axis=1)
def main():
    df_train = pd.read_csv('../input/train.csv')
    df_test = pd.read_csv('../input/test.csv')
    sns.set()
    sns.pairplot(df_train[["bone_length", "rotting_flesh", "hair_length", "has_soul", "type"]], hue="type")
    # plt.show()
    df_train['hair_soul'] = df_train['hair_length'] * df_train['has_soul']
    df_train['hair_bone'] = df_train['hair_length'] * df_train['bone_length']
    df_train['hair_soul_bone'] = df_train['hair_length'] * df_train['has_soul'] * df_train['bone_length']
    df_test['hair_soul'] = df_test['hair_length'] * df_test['has_soul']
    df_test['hair_bone'] = df_test['hair_length'] * df_test['bone_length']
    df_test['hair_soul_bone'] = df_test['hair_length'] * df_test['has_soul'] * df_test['bone_length']
    test_id = df_test['id']
    df_train.drop(['id'], axis=1, inplace=True)
    df_test.drop(['id'], axis=1, inplace=True)
    df_train.drop(['color'], axis=1, inplace=True)
    df_test.drop(['color'], axis=1, inplace=True)
    df_train_data = df_train.drop('type', axis=1)
    df_train_results = df_train['type']
    df_train_data = pd.get_dummies(df_train_data)
    df_test_data = pd.get_dummies(df_test)
    test_results = run_classifier(df_train_data, df_train_results, df_test_data, 'rf')
    save_result(test_id, test_results, 'results_logistic_regression.csv')