def multidimensional_plots(df, target_name, maxevents=10000, standardize=False):
    """Draw Andrews-curves, parallel-coordinates and RadViz views of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the class column named by ``target_name``.
    target_name : str
        Name of the column used to colour the samples.
    maxevents : int
        Upper bound on the number of randomly chosen rows that are drawn.
    standardize : bool
        When true, every feature column is rescaled to zero mean and unit
        standard deviation before plotting. The class column is left
        untouched and ends up as the last column.

    Returns
    -------
    The 9x9 inch figure created with ``plt.figure`` holding the three plots
    on a 2x2 grid (the fourth cell stays empty).
    """
    if standardize:
        features = df.drop(target_name, axis=1)
        # A constant column has zero spread; dividing by 1 keeps it at 0
        # instead of turning the whole column into NaN.
        spread = features.std().replace(0, 1.0)
        data = (features - features.mean()) / spread
        # ``.values`` avoids index alignment problems with duplicate labels.
        data[target_name] = df[target_name].values
    else:
        data = df

    # Shuffle by position rather than by label so that a non-unique index
    # cannot break the sampling, then cap the number of rows.
    order = np.random.permutation(len(data))[:maxevents]
    df_random = data.iloc[order]

    # Make a figure and declare the size
    fig = plt.figure(figsize=(9, 9))

    # One multivariate plot per grid cell
    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random, target_name, ax=current_axis,
                         colormap='gist_rainbow')

    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    #fig.tight_layout()
    return fig
def _radviz(frame, ax, ax_conf, class_column, color_values=None):
    """Draw a RadViz plot of ``frame`` on ``ax`` and style it from ``ax_conf``.

    ``ax_conf`` is read for the legend, axis-visibility, tick/label colour,
    axis-label text and spine-colour attributes used below.
    """
    radviz(frame, class_column, ax=ax, color=color_values)

    # ---- legend configuration
    if ax_conf.legend_show:
        edge_colour = ax_conf.legend_edge_color
        legend = ax.legend(
            prop={'size': ax_conf.legend_size},
            loc=ax_conf.legend_loc,
            fancybox=True,
        )
        legend.get_frame().set_edgecolor(edge_colour)

    # ---- tick configuration: visibility, labels, colours
    ax.get_xaxis().set_visible(ax_conf.x_axis_show)
    ax.get_yaxis().set_visible(ax_conf.y_axis_show)
    ax.tick_params(axis='x', colors=ax_conf.x_axis_color)
    ax.tick_params(axis='y', colors=ax_conf.y_axis_color)

    ax.set_xlabel(ax_conf.x_axis_label, labelpad=-1)
    ax.xaxis.label.set_color(ax_conf.x_color_label)
    ax.set_ylabel(ax_conf.y_axis_label, labelpad=-1)
    ax.yaxis.label.set_color(ax_conf.y_color_label)

    # ---- spine configuration
    for side in ('top', 'bottom', 'left', 'right'):
        spine_colour = getattr(ax_conf, 'color_%s_spine' % side)
        ax.spines[side].set_color(spine_colour)
def multidimensional_plots(df, target_name, maxevents=10000):
    """Draw standardised Andrews-curves, parallel-coordinates and RadViz plots.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the class column named by ``target_name``.
    target_name : str
        Name of the column used to colour the samples.
    maxevents : int
        Upper bound on the number of randomly chosen rows that are drawn.

    Returns
    -------
    The 9x9 inch figure created with ``plt.figure`` holding the three plots
    on a 2x2 grid (the fourth cell stays empty).
    """
    # Normalise the features only: the class column may hold non-numeric
    # labels and has to stay unscaled anyway.
    features = df.drop(target_name, axis=1)
    # A constant column has zero spread; dividing by 1 keeps it at 0
    # instead of turning the whole column into NaN.
    spread = features.std().replace(0, 1.0)
    df_std = (features - features.mean()) / spread

    # Put the unnormalised target back (it becomes the last column).
    df_std[target_name] = df[target_name].values

    # Randomise the row order by position and make sure this doesn't take
    # too long by keeping at most ``maxevents`` rows.
    order = np.random.permutation(len(df_std))[:maxevents]
    df_random = df_std.iloc[order]

    # Make a figure and declare the size
    fig = plt.figure(figsize=(9, 9))

    # One multivariate plot per grid cell
    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random, target_name, ax=current_axis,
                         colormap='gist_rainbow')

    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    #fig.tight_layout()
    return fig
def _radviz(frame, ax, ax_conf, class_column, color_values=None):
    """Draw a RadViz plot of ``frame`` on ``ax`` and style it from ``ax_conf``.

    ``ax_conf`` is read for the legend, axis-visibility, tick/label colour,
    axis-label text and spine-colour attributes used below.
    """
    radviz(frame, class_column, ax=ax, color=color_values)

    # ---- legend configuration
    if ax_conf.legend_show:
        edge_colour = ax_conf.legend_edge_color
        legend = ax.legend(
            prop={'size': ax_conf.legend_size},
            loc=ax_conf.legend_loc,
            fancybox=True,
        )
        legend.get_frame().set_edgecolor(edge_colour)

    # ---- tick configuration: visibility, labels, colours
    ax.get_xaxis().set_visible(ax_conf.x_axis_show)
    ax.get_yaxis().set_visible(ax_conf.y_axis_show)
    ax.tick_params(axis='x', colors=ax_conf.x_axis_color)
    ax.tick_params(axis='y', colors=ax_conf.y_axis_color)

    ax.set_xlabel(ax_conf.x_axis_label, labelpad=-1)
    ax.xaxis.label.set_color(ax_conf.x_color_label)
    ax.set_ylabel(ax_conf.y_axis_label, labelpad=-1)
    ax.yaxis.label.set_color(ax_conf.y_color_label)

    # ---- spine configuration
    for side in ('top', 'bottom', 'left', 'right'):
        spine_colour = getattr(ax_conf, 'color_%s_spine' % side)
        ax.spines[side].set_color(spine_colour)
def kolka(wywalmn5=True, doddowyw=None):
    """Save one RadViz image per remaining column of the global ``dane`` frame.

    Parameters
    ----------
    wywalmn5 : bool
        When true, columns with fewer than 5 distinct values are excluded.
    doddowyw : iterable of str, optional
        Additional column names to exclude.

    For every column that survives the exclusion a RadViz plot using that
    column as the class column is written to
    ``radviz_<number of excluded names>_<column>.png``.
    """
    try:
        from pandas.plotting import radviz
    except ImportError:  # pandas < 0.20
        from pandas.tools.plotting import radviz

    extra = [] if doddowyw is None else list(doddowyw)

    # Columns with fewer than 5 distinct values; stays empty when the
    # filter is switched off (previously the name was undefined then).
    mn5 = []
    if wywalmn5:
        mn5 = [col for col in dane.columns if len(dane[col].unique()) < 5]

    # The fixed names are the ones the original author listed as non-integer
    # columns; further columns can be excluded through ``doddowyw``.
    exclude = ['godziny', 'oc_sem', 'oc_rok', 'uczelnia'] + mn5 + extra
    daneint = dane[dane.columns.difference(exclude)]

    plt.figure(figsize=(10, 10))
    for nazwa in daneint.columns:
        plt.title(nazwa)
        radviz(daneint, nazwa)
        # save to file; ``format`` avoids the former ``str + int`` TypeError
        plt.savefig("radviz_{}_{}.png".format(len(exclude), nazwa))
        plt.clf()
def test_radviz(pandas=False, outpath=None):
    """
    Runs the radviz visualizer on the dataset.

    Parameters
    ----------
    pandas : bool
        Run the pandas version of the function
    outpath : path or None
        Save the figure to disk rather than show (if None)
    """
    data = load_data('occupancy')  # Load the data
    features = ['temp', 'humid', 'light', 'co2', 'hratio']
    classes = ['unoccupied', 'occupied']

    if pandas:
        radviz(data[features + ['occupied']], 'occupied')
        if outpath:
            plt.savefig(outpath)
        else:
            plt.show()
        return

    # ``.values`` replaces ``.as_matrix()``, which newer pandas no longer has;
    # the arrays are only needed by the visualizer branch.
    X = data[features].values
    y = data.occupied.values

    visualizer = RadViz(  # Instantiate the visualizer
        classes=classes, features=features)
    visualizer.fit(X, y)  # Fit the data to the visualizer
    visualizer.transform(X)  # Transform the data
    visualizer.poof(outpath=outpath)  # Draw/show/poof the data
def visualize(config):
    """Save scatter-matrix, parallel-coordinates and RadViz images per dataset.

    For every entry of ``config['datasets']`` three PNG files named
    ``<name>_scatter_matrix.png``, ``<name>_parallel_coordinates.png`` and
    ``<name>_radviz.png`` are written; the class-based plots use the
    ``'quality'`` column. Returns the module-level ``OK`` value.
    """
    # These visualizations help when designing a feature vector
    for dataset in config['datasets']:

        def save_and_close(suffix):
            plt.savefig(dataset['name'] + suffix + '.png')
            plt.close()

        scatter_matrix(dataset['df'], alpha=0.2, figsize=(20, 20),
                       diagonal='kde')
        save_and_close('_scatter_matrix')

        class_plots = (
            (parallel_coordinates, '_parallel_coordinates'),
            (radviz, '_radviz'),
        )
        for plotter, suffix in class_plots:
            plt.figure(figsize=(20, 20))
            plotter(dataset['df'], 'quality')
            save_and_close(suffix)

    return OK
def multidimensional_plots(df, target_name, maxevents=10000):
    """Draw standardised Andrews-curves, parallel-coordinates and RadViz plots.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the class column named by ``target_name``.
    target_name : str
        Name of the column used to colour the samples.
    maxevents : int
        Upper bound on the number of randomly chosen rows that are drawn.

    Returns
    -------
    The 9x9 inch figure created with ``plt.figure`` holding the three plots
    on a 2x2 grid (the fourth cell stays empty).
    """
    # Normalise the features only: the class column may hold non-numeric
    # labels and has to stay unscaled anyway.
    features = df.drop(target_name, axis=1)
    # A constant column has zero spread; dividing by 1 keeps it at 0
    # instead of turning the whole column into NaN.
    spread = features.std().replace(0, 1.0)
    df_std = (features - features.mean()) / spread

    # Put the unnormalised target back (it becomes the last column).
    df_std[target_name] = df[target_name].values

    # Randomise the row order by position and make sure this doesn't take
    # too long by keeping at most ``maxevents`` rows.
    order = np.random.permutation(len(df_std))[:maxevents]
    df_random = df_std.iloc[order]

    # Make a figure and declare the size
    fig = plt.figure(figsize=(9, 9))

    # One multivariate plot per grid cell
    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random, target_name, ax=current_axis,
                         colormap='gist_rainbow')

    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    #fig.tight_layout()
    return fig
def plot_radviz(dataset):
    """
    Show a RadViz plot of the provided DataSet.

    RadViz is useful for visualizing data with more than two dimensions.
    """
    # radviz needs a single DataFrame holding features and labels together,
    # plus the name of the column that carries the class membership.
    labelled_frame = dataset.get_labelled_data_frame()
    class_column = dataset.get_labels().name
    radviz(labelled_frame, class_column)
    plt.show()
def Evt_Multi_D_Radviz_Plot(self, event):
    """Redraw the selected tab with a RadViz plot of the checked variables.

    The columns named by the second element of each entry in
    ``self.selected_checkboxes`` are plotted for the rows
    ``self.minimum:self.maximum`` with ``"customer_number"`` as the class
    column. ``event`` is not used.
    """
    page = self.New_Tab.GetSelection()
    panel = self.New_Tab.GetPage(page)
    # presumably refreshes ``self.selected_checkboxes`` - confirm in the class
    self.selected_checkbox()

    figure = panel.canvas.figure
    figure.clf()

    data_list = [variable[1] for variable in self.selected_checkboxes]
    # Selecting the class column twice would yield duplicate columns.
    if "customer_number" not in data_list:
        data_list.append("customer_number")
    data = self.data[data_list][self.minimum:self.maximum]

    # Draw on the panel's own figure; without ``ax`` radviz falls back to
    # pyplot's current axes, which need not belong to this panel.
    axes = figure.add_subplot(111)
    radviz(data, "customer_number", ax=axes)
    panel.canvas.draw()
def Evt_Multi_D_Radviz_Plot(self, event):
    """Redraw the selected tab with a RadViz plot of the checked variables.

    The columns named by the second element of each entry in
    ``self.selected_checkboxes`` are plotted for the rows
    ``self.minimum:self.maximum`` with ``"customer_number"`` as the class
    column. ``event`` is not used.
    """
    page = self.New_Tab.GetSelection()
    panel = self.New_Tab.GetPage(page)
    # presumably refreshes ``self.selected_checkboxes`` - confirm in the class
    self.selected_checkbox()

    figure = panel.canvas.figure
    figure.clf()

    data_list = [variable[1] for variable in self.selected_checkboxes]
    # Selecting the class column twice would yield duplicate columns.
    if "customer_number" not in data_list:
        data_list.append("customer_number")
    data = self.data[data_list][self.minimum:self.maximum]

    # Draw on the panel's own figure; without ``ax`` radviz falls back to
    # pyplot's current axes, which need not belong to this panel.
    axes = figure.add_subplot(111)
    radviz(data, "customer_number", ax=axes)
    panel.canvas.draw()
def plot_radviz_comparison(self, category_column, columns=None, rows=None,
                           filters=None, point_size=30):
    """return plot axis of radviz graph

    RadViz is a way of visualizing multi-variate data. It is based on a
    simple spring tension minimization algorithm. Basically you set up a
    bunch of points in a plane. In our case they are equally spaced on a unit
    circle. Each point represents a single attribute. You then pretend that
    each sample in the data set is attached to each of these points by a
    spring, the stiffness of which is proportional to the numerical value of
    that attribute (they are normalized to unit interval). The point in the
    plane, where our sample settles to (where the forces acting on our sample
    are at an equilibrium) is where a dot representing our sample will be
    drawn. Depending on which class that sample belongs it will be colored
    differently.

    Parameters
    ----------
    category_column : str
        Class column; must be a column of ``self._df`` other than
        ``'Molecule'``.
    columns, rows, filters
        Forwarded to ``self.get_table``. Omitted values are passed on as
        empty list / list / dict. A non-empty ``columns`` selection is
        extended with the category column (by name, or by position when the
        selection consists of integers); the caller's list is not modified.
    point_size : int
        Marker size handed to ``radviz`` as ``s``.

    Raises
    ------
    ValueError
        If ``category_column`` is not an available column.
    """
    col_names = self._df.drop('Molecule', axis=1).columns.tolist()
    if category_column not in col_names:
        raise ValueError('{0} not in columns'.format(category_column))

    # Work on copies / fresh containers so neither the caller's arguments
    # nor shared default objects are ever mutated.
    columns = [] if columns is None else list(columns)
    rows = [] if rows is None else rows
    filters = {} if filters is None else filters

    if columns and category_column not in columns:
        if all(isinstance(item, int) for item in columns):
            position = col_names.index(category_column)
            # avoid selecting the category column twice
            if position not in columns:
                columns.append(position)
        else:
            columns.append(category_column)

    df = self.get_table(rows, columns, filters)
    ax = radviz(df, category_column, s=point_size)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    #ax.set_frame_on(False)
    return ax
def test_radviz(self):
    """Smoke-test ``radviz`` and check colours for several colour specs."""
    from pandas.tools.plotting import radviz
    from matplotlib import cm

    def labelled_patches(axes):
        # skip Circle drawn as ticks
        return [p for p in axes.patches[:20] if p.get_label() != '']

    df = self.iris
    _check_plot_works(radviz, df, 'Name')

    rgba = ('#556270', '#4ECDC4', '#C7F464')
    ax = _check_plot_works(radviz, df, 'Name', color=rgba)
    self._check_colors(labelled_patches(ax)[:10], facecolors=rgba,
                       mapping=df['Name'][:10])

    # NOTE(review): ``ax`` is not rebound by the next two plot calls, so the
    # patches inspected below still come from the ``rgba`` axes - confirm
    # whether that is intended.
    cnames = ['dodgerblue', 'aquamarine', 'seagreen']
    _check_plot_works(radviz, df, 'Name', color=cnames)
    self._check_colors(labelled_patches(ax), facecolors=cnames,
                       mapping=df['Name'][:10])

    _check_plot_works(radviz, df, 'Name', colormap=cm.jet)
    cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
    self._check_colors(labelled_patches(ax), facecolors=cmaps,
                       mapping=df['Name'][:10])

    # Explicit RGBA colours must show up on the legend handles.
    colors = [[0., 0., 1., 1.], [0., 0.5, 1., 1.], [1., 0., 0., 1.]]
    df = DataFrame({
        "A": [1, 2, 3],
        "B": [2, 1, 3],
        "C": [3, 2, 1],
        "Name": ['b', 'g', 'r'],
    })
    ax = radviz(df, 'Name', color=colors)
    handles, labels = ax.get_legend_handles_labels()
    self._check_colors(handles, facecolors=colors)
def test_radviz(self):
    """Smoke-test ``radviz`` and check colours for several colour specs."""
    from pandas.tools.plotting import radviz
    from matplotlib import cm

    def labelled_patches(axes):
        # skip Circle drawn as ticks
        return [p for p in axes.patches[:20] if p.get_label() != '']

    df = self.iris
    _check_plot_works(radviz, df, 'Name')

    rgba = ('#556270', '#4ECDC4', '#C7F464')
    ax = _check_plot_works(radviz, df, 'Name', color=rgba)
    self._check_colors(labelled_patches(ax)[:10], facecolors=rgba,
                       mapping=df['Name'][:10])

    # NOTE(review): ``ax`` is not rebound by the next two plot calls, so the
    # patches inspected below still come from the ``rgba`` axes - confirm
    # whether that is intended.
    cnames = ['dodgerblue', 'aquamarine', 'seagreen']
    _check_plot_works(radviz, df, 'Name', color=cnames)
    self._check_colors(labelled_patches(ax), facecolors=cnames,
                       mapping=df['Name'][:10])

    _check_plot_works(radviz, df, 'Name', colormap=cm.jet)
    cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
    self._check_colors(labelled_patches(ax), facecolors=cmaps,
                       mapping=df['Name'][:10])

    # Explicit RGBA colours must show up on the legend handles.
    colors = [[0., 0., 1., 1.], [0., 0.5, 1., 1.], [1., 0., 0., 1.]]
    df = DataFrame({
        "A": [1, 2, 3],
        "B": [2, 1, 3],
        "C": [3, 2, 1],
        "Name": ['b', 'g', 'r'],
    })
    ax = radviz(df, 'Name', color=colors)
    handles, labels = ax.get_legend_handles_labels()
    self._check_colors(handles, facecolors=colors)
def Clonal_Evolution_Multidimensional_Data(self):
    """Save parallel-coordinates, Andrews-curves and RadViz plots as EPS.

    Every frame of the module-level ``DataStructs`` sequence is tagged with
    its position as a float time column ``'t'`` (the column is written into
    the frames themselves, as before). The frames are stacked and the
    ``t``, ``Size``, ``PR`` and ``MR`` columns are divided by their maximum;
    ``ID`` is the class column. Three EPS files are written to the current
    directory.
    """
    # Tag each snapshot with its time step and stack all of them.
    frames = []
    for step, df in enumerate(DataStructs):
        df['t'] = float(step)
        frames.append(df)
    Clonal_Evolution_df = pd.concat(frames, ignore_index=True)

    def unit_scaled(values):
        # Leave the column unchanged when its maximum is 0 (e.g. a single
        # time step) instead of producing 0/0 = NaN.
        peak = values.max()
        return values / peak if peak else values

    # Built from a mapping rather than ``zip`` so it does not depend on the
    # DataFrame constructor accepting an iterator.
    Normalised_df = pd.DataFrame(
        {
            't': unit_scaled(Clonal_Evolution_df['t']),
            'Size': unit_scaled(Clonal_Evolution_df['Size']),
            'PR': unit_scaled(Clonal_Evolution_df['PR']),
            'MR': unit_scaled(Clonal_Evolution_df['MR']),
            'ID': Clonal_Evolution_df['ID'],
        },
        columns=['t', 'Size', 'PR', 'MR', 'ID'],
    )

    plt.figure()
    parallel_coordinates(Normalised_df, 'ID',
                         colormap='jet').set_title("PC Plot")
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    plt.savefig('Clonal_Evolution_Parallel_Coords_Plot.eps', format='eps',
                dpi=1000)

    plt.figure()
    andrews_curves(Normalised_df, 'ID', colormap='jet')
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    plt.savefig('Clonal_Evolution_Andrews_Curves_Plot.eps', format='eps',
                dpi=1000)

    plt.figure()
    radviz(Normalised_df, 'ID', colormap='jet')
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    plt.savefig('Clonal_Evolution_RadViz_Plot.eps', format='eps', dpi=1000)
def t4(tp='r'):
    """Show one multivariate plot of the stock CSV, coloured by ``'Name'``.

    ``tp`` selects the plot: ``'r'`` RadViz, ``'a'`` Andrews curves,
    ``'p'`` parallel coordinates. Any other value shows an empty figure.
    """
    # Visualisation of multivariate data (needs pandas, e.g. ``conda
    # install pandas``); background article:
    # http://cloga.info/%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90/2016/10/12/multivariate-data-visualization
    import pandas as pd
    import matplotlib.pyplot as plt
    try:
        from pandas.plotting import andrews_curves
        from pandas.plotting import parallel_coordinates
        from pandas.plotting import radviz
    except ImportError:  # pandas < 0.20
        from pandas.tools.plotting import andrews_curves
        from pandas.tools.plotting import parallel_coordinates
        from pandas.tools.plotting import radviz

    data = pd.read_csv('file:///e:/stock/xx1')

    plotters = {
        'r': radviz,
        'a': andrews_curves,
        'p': parallel_coordinates,
    }
    plt.figure()
    plotter = plotters.get(tp)
    if plotter is not None:
        plotter(data, 'Name')
    plt.show()
def radvizPlot(base, classe):
    """Show a RadViz plot of ``base`` with ``classe`` as the class column."""
    plt.figure(figsize=(10, 8))
    axes = radviz(base, classe)

    # Compact two-column legend anchored at the top-left corner of the axes
    legend_options = dict(
        loc='center left',
        bbox_to_anchor=(0, 1),
        fancybox=True,
        ncol=2,
        fontsize='x-small',
    )
    axes.legend(**legend_options)

    plt.ylim([-2, 2])
    plt.show()
def radial_plot(self, data=None, labels=None, x_label=None, y_label=None,
                title=None):
    '''wrapper for pandas radviz

    ``data`` is the frame to plot and ``labels`` the name of its class
    column. ``title``, ``x_label`` and ``y_label`` are applied to the axes
    when given; the seaborn default palette supplies the class colours.
    '''
    ax = radviz(data, labels, color=sns.color_palette())

    # These arguments used to be accepted but silently ignored.
    if title is not None:
        ax.set_title(title)
    if x_label is not None:
        ax.set_xlabel(x_label)
    if y_label is not None:
        ax.set_ylabel(y_label)

    plt.show()
def createRadViz(self, data, base_dir, fileName):
    """Write one RadViz page per low-cardinality column of ``data`` to a PDF.

    A column is used as class column when it has between 2 and 20 distinct
    values (as counted by ``value_counts``). Each page plots the numeric
    columns of ``data`` against that class column. The PDF path is
    ``base_dir`` and ``fileName`` joined without a separator.
    """
    try:
        from pandas.plotting import radviz
    except ImportError:  # pandas < 0.20
        from pandas.tools.plotting import radviz

    pdf = PdfPages(''.join([base_dir, fileName]))
    try:
        numeric = data._get_numeric_data()
        for cols in data.columns.values:
            n_classes = len(data[cols].value_counts())
            if not 1 < n_classes <= 20:
                continue

            # Copy so that adding the class column never writes into
            # ``data`` or leaks into the next iteration.
            req_data = numeric.copy()
            req_data[cols] = data[cols]

            plt.figure()
            ax = radviz(req_data, cols)
            ax.set_title("plot of radviz vis {}".format(cols))
            page = ax.get_figure()
            pdf.savefig(page)
            # Release the figure; otherwise one stays open per column.
            plt.close(page)
    finally:
        # Close the file even if a plot fails.
        pdf.close()
def visualize(config):
    """Save scatter-matrix, parallel-coordinates and RadViz images per dataset.

    For every entry of ``config['datasets']`` three PNG files named
    ``<name>_scatter_matrix.png``, ``<name>_parallel_coordinates.png`` and
    ``<name>_radviz.png`` are written; the class-based plots use the
    ``'quality'`` column. Returns the module-level ``OK`` value.
    """
    # These visualizations help when designing a feature vector
    for dataset in config['datasets']:

        def save_and_close(suffix):
            plt.savefig(dataset['name'] + suffix + '.png')
            plt.close()

        scatter_matrix(dataset['df'], alpha=0.2, figsize=(20, 20),
                       diagonal='kde')
        save_and_close('_scatter_matrix')

        class_plots = (
            (parallel_coordinates, '_parallel_coordinates'),
            (radviz, '_radviz'),
        )
        for plotter, suffix in class_plots:
            plt.figure(figsize=(20, 20))
            plotter(dataset['df'], 'quality')
            save_and_close(suffix)

    return OK
def plot_radial(data_frame, class_name):
    """Draw a RadViz plot on the cleared current figure and save it.

    The image is written to ``Status.RADIAL_NAME`` inside
    ``Status.TEMP_DIR``.
    """
    plt.clf()
    radviz(data_frame, class_name)
    # plt.show(block=False)
    plt.title('Radial Plot')
    target_path = join(Status.TEMP_DIR, Status.RADIAL_NAME)
    plt.savefig(target_path)
import pandas as pd
#import warnings
#warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
try:
    from pandas.plotting import parallel_coordinates
    from pandas.plotting import radviz
except ImportError:  # pandas < 0.20
    from pandas.tools.plotting import parallel_coordinates
    from pandas.tools.plotting import radviz

sns.set(style="white", color_codes=True)

# NOTE: the variable keeps its historical name, but the file read is wine.csv
iris = pd.read_csv("wine.csv")
#iris.head()
#print iris
#print iris["Wine"].value_counts()
#sns.jointplot(x="A1", y="A2", data=iris, kind='reg', size=6)
#sns.pairplot(iris, hue="Wine",size=1)

parallel_coordinates(iris, "Wine")

# Separate figure, otherwise RadViz is drawn on top of the previous plot.
plt.figure()
radviz(iris, "Wine")

# ``sns.plt`` is no longer exposed by seaborn; use pyplot directly.
plt.show()
def test_radviz_deprecated(self):
    """Calling ``plotting.radviz`` with these keywords emits FutureWarning."""
    frame = self.iris
    keywords = {'frame': frame, 'class_column': 'Name'}
    with tm.assert_produces_warning(FutureWarning):
        plotting.radviz(**keywords)
parallel_coordinates(df, "Survived")

# 8. A final multivariate visualization technique pandas
#    has is radviz.
#    In Radviz, each dimension in the dataset is represented by
#    a dimensional anchor, and each dimensional anchor is distributed
#    evenly on a unit circle. Each line in the data set corresponds
#    to a point in the projection, that is linked to every dimensional
#    anchor by a spring. Each spring's stiffness corresponds to the
#    value for that particular thing in that particular dimension.
#    The position of the point is defined as the point in the 2D space
#    where the spring's tension is minimum.

# 8.1 First with random data
radviz(df1, "a")  # There is no pulling anywhere

# 8.2 Then with titanic data
radviz(df, "Survived")

# 9. Next is t-sne
tsne = TSNE()

# 9.1 First t-sne of random data
#     (second argument is the first column only, matching the titanic call;
#     it used to be ``iloc[:, 0:]``, i.e. the whole frame)
tsne_results_random = tsne.fit_transform(df1.iloc[:, 1:], df1.iloc[:, 0])

# 9.2 Next, t-sne of titanic data
tsne_results_titanic = tsne.fit_transform(df.iloc[:, 1:], df.iloc[:, 0])

# 10. Plot the two results
# 10.1 First deep copy of random data
df1_tsne = df1.iloc[:, 1:].copy()

# 10.2 Here is X-axis points
# create a plot for each metric
for metric in stats:
    print('Plotting metric: %s' % metric)
    # ``DataFrame.sort`` no longer exists; ``sort_values`` is its successor
    df = df.sort_values(metric, ascending=True)
    fig = plt.figure(figsize=(18, 18))
    axes = plt.Axes(fig, [.2, .1, .7, .8])  # [left, bottom, width, height]
    fig.add_axes(axes)
    df[metric].plot(kind='barh', title=metric, alpha=0.7)
    plt.savefig('images/' + metric.replace(' ', '-'))
    # release the figure so they do not pile up, one per metric
    plt.close(fig)

# create csv
df.to_csv('data/todomvc-metrics.csv')

# create radviz
try:
    from pandas.plotting import radviz
except ImportError:  # pandas < 0.20
    from pandas.tools.plotting import radviz

# ``copy`` so that adding the class column does not write into a slice of df
df_rad = df[['Sum Logical SLOC', 'Mean Cyclomatic Complexity',
             'Sum Halstead Time', 'Mean Maintainability Index']].copy()
df_rad['Name'] = df_rad.index.tolist()

fig = plt.figure(figsize=(18, 18))
ax = radviz(df_rad, 'Name')
legend = ax.legend(fontsize='xx-small', fancybox=True, ncol=3)
plt.setp(legend.get_title(), fontsize='xx-small')
plt.savefig('images/radviz')
# The plotting helpers moved to ``pandas.plotting``; fall back to the old
# location on pandas releases that predate the move.
try:
    from pandas.plotting import radviz
except ImportError:  # pandas < 0.20
    from pandas.tools.plotting import radviz

data = pd.read_csv('data/iris.data')

# RadViz of all columns, coloured by the 'Name' column
plt.figure()
radviz(data, 'Name')
df.to_csv(OUTPATH, sep='\t', index=False, header=False)
# Single-argument ``print(...)`` behaves the same on Python 2 and 3.
print("Wrote dataset of %i instances and %i attributes to %s" %
      (df.shape + (OUTPATH,)))

with open('meta.json', 'w') as f:
    meta = {'feature_names': FEATURES, 'target_names': LABEL_MAP}
    json.dump(meta, f, indent=4)

# Describe the dataset
print(df.describe())

# Determine the shape of the data
print("{} instances with {} features\n".format(*df.shape))

# Determine the frequency of each class
print(df.groupby('label')['label'].count())

# Create a scatter matrix of the dataframe features
scatter_matrix(df, alpha=0.2, figsize=(12, 12), diagonal='kde')
plt.show()

# Parallel Coordinates
plt.figure(figsize=(12, 12))
parallel_coordinates(df, 'label')
plt.show()

# Radviz
plt.figure(figsize=(12, 12))
radviz(df, 'label')
plt.show()
ax.set_zlabel("Z Label")
# NOTE(review): ``set_xticks`` receives formatted date strings here; tick
# labels are normally set with ``set_xticklabels`` - confirm the intent.
ax.set_xticks(df.finalDate[x].apply(lambda d: d.strftime("%Y-%m-%d")).values)
plt.show()

#################Several plots in 2D
df.plot(subplots=True)
df.plot(x="finalDate", y="finalBalance")
df.hist(by=["windowSize", "trainingSize"])
df.boxplot("finalBalance", by=["windowSize", "trainingSize"])
scatter_matrix(df, alpha=0.2, diagonal="kde")
df.plot(x="finalDate", y="finalBalance", kind="kde")

# These helpers draw on the current axes, so each gets a fresh figure
# instead of being painted over the previous plot.
plt.figure()
parallel_coordinates(df, "windowSize")
plt.figure()
autocorrelation_plot(df.finalBalance)
plt.figure()
radviz(df, "finalBalance")

df.plot(colormap="jet")

#################More specific plots in 2D
f, (ax1, ax2) = plt.subplots(2, 3)

# Select the column before aggregating: same result for ``finalBalance``
# without aggregating every other column as well.
balance_by_window = df.groupby(["windowSize"])["finalBalance"]
balance_by_training = df.groupby(["trainingSize"])["finalBalance"]

ax1[0].plot(balance_by_window.mean())
ax1[0].set_title("Window Size Mean")
ax1[0].set_ylim((5000, 15000))
ax2[0].plot(balance_by_window.sum())
ax2[0].set_title("Window Size Sum")

ax1[1].plot(balance_by_training.mean())
ax1[1].set_title("Training Size Mean")
ax1[1].set_ylim((5000, 15000))
def format_data():
    """Load the 2013 fee-overrun workbook, clean it and draw overview plots.

    Sheet 2 of the Excel file is cleaned (``'nc'`` rows dropped, only
    ``'<code>- <name>'`` style department/specialty rows with a positive
    head count kept), the department and specialty strings are split into
    numeric code and name columns, per-practitioner ratios are added, and a
    histogram, scatter matrix, correlation heat map and RadViz plot are
    drawn.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame including the derived columns. As before it also
        carries a second copy of ``DEPARTEMENT`` and ``SPECIALISTES`` coming
        from the split frames.
    """
    path = "data/Honoraires_totaux_des_professionnels_de_sante_par_departement_en_2013.xls"
    # The sheet selector is passed positionally because its keyword was
    # renamed from ``sheetname`` to ``sheet_name`` between pandas releases.
    data = pd.read_excel(path, [1, 2, 3, 4, 5, 6])
    df = data[2]

    df = df.replace('nc', np.nan).dropna()
    df = df[df['DEPARTEMENT'].str.contains('- ')]
    df = df[df['SPECIALISTES'].str.contains('- ')]
    df = df[df['EFFECTIFS'] > 0]

    # Split "<code>- <name>" once. The frames must reuse ``df``'s index:
    # ``concat`` aligns on labels, and a fresh 0..n-1 index would pair the
    # codes with the wrong rows (or drop rows) after the filtering above.
    dep = pd.DataFrame([[x] + x.split('- ', 1) for x in df['DEPARTEMENT']],
                       columns=['DEPARTEMENT', 'num_dep', 'name_dep'],
                       index=df.index)
    spec = pd.DataFrame([[x] + x.split('- ', 1) for x in df['SPECIALISTES']],
                        columns=['SPECIALISTES', 'num_spec', 'name_spec'],
                        index=df.index)
    df = pd.concat([df, dep, spec], axis=1, join='inner')

    # 'B' / 'A' suffixes of department codes become .5 / .25. Leading zeros
    # need no stripping, the float conversion ignores them.
    df['num_dep'] = df['num_dep'].str.replace('B', '.5').str.replace(
        'A', '.25').astype('float')
    df['num_spec'] = df['num_spec'].astype('float')
    df['DEPASSEMENTS (euros)'] = df['DEPASSEMENTS (euros)'].astype('float')
    print(df.head())

    df['NOMBRE DE DEPASSEMENTS (/medecin)'] = df['NOMBRE DE DEPASSEMENTS'] / df['EFFECTIFS']
    df['DEPASSEMENTS (euros/medecin)'] = df['DEPASSEMENTS (euros)'] / df['EFFECTIFS']
    #df['DEPASSEMENT MOYEN (euros/medecin)'] = df['DEPASSEMENT MOYEN (euros)'] / df['EFFECTIFS']

    #------------ histogram of 'DEPASSEMENTS (euros)' -----------------------
    # NOTE: the title mentions removing outliers, but the filter below is
    # commented out, so only the x-range of the plot is limited.
    plt.figure()
    plt.title('on considere les DEPASSEMENTS totaux > 5e6€ comme outliers (on le retire)')
    df['DEPASSEMENTS (euros)'].plot(kind='hist', stacked=True, bins=100)
    plt.xlim(0, 6e6)
    #df = df[df['DEPASSEMENTS (euros)'] <= 5.e6]

    #-------------- scatter_matrix -----------------------------------------
    dat = df[['num_spec', 'NOMBRE DE DEPASSEMENTS', 'DEPASSEMENT MOYEN (euros)',
              'DEPASSEMENTS (euros)', 'EFFECTIFS']]
    scatter_matrix(dat, diagonal='kde')

    # ---------- correlations between the variables, by specialty number ----
    corr_mat = dat.corr()
    plt.figure()
    sns.heatmap(corr_mat, square=True)

    plt.figure()
    plt.title('Sans prendre en compte le departement.\n Un EFFECTIF plus faible semble corrélé à des\n' +
              ' NOMBRE DE DEPASSEMENTS et DEPASSEMENTS (totaux) plus elevés.\n' +
              'Il semble également que' +
              " les EFFECTIFS soient mal repartis d'une specialite à l'autre.")
    radviz(dat, 'num_spec')

    print(df.head())
    return df
def rad_viz(df, labels):
    """Show a RadViz plot of ``df`` coloured with the seaborn palette.

    ``labels`` is the name of the class column in ``df``.
    """
    palette = sns.color_palette()
    radviz(df, labels, color=palette)
    plt.show()
df = pd.DataFrame(stats)

# create a plot for each metric
for metric in stats:
    # ``DataFrame.sort`` no longer exists; ``sort_values`` is its successor
    df = df.sort_values(metric, ascending=True)
    fig = plt.figure()
    axes = plt.Axes(fig, [.2, .1, .7, .8])  # [left, bottom, width, height]
    fig.add_axes(axes)
    df[metric].plot(kind='barh', title=metric, alpha=0.7)
    plt.savefig('images/' + metric.replace(' ', '-'))

# create csv
df.to_csv('data/todomvc-metrics.csv')

# create radviz
try:
    from pandas.plotting import radviz
except ImportError:  # pandas < 0.20
    from pandas.tools.plotting import radviz

# ``copy`` so that adding the class column does not write into a slice of df
df_rad = df[['Sum Logical SLOC', 'Mean Cyclomatic Complexity',
             'Sum Halstead Time', 'Mean Maintainability Index']].copy()
df_rad['Name'] = df_rad.index.tolist()

fig = plt.figure()
radviz(df_rad, 'Name')
plt.savefig('images/radviz')
life

# <codecell>

# The plotting helpers moved to ``pandas.plotting``; fall back to the old
# location on pandas releases that predate the move.
try:
    from pandas.plotting import radviz
except ImportError:  # pandas < 0.20
    from pandas.tools.plotting import radviz

# <codecell>

#rn = range(0,56)
#for n in rn:
#    print df.values[-n]
plt.figure()

# <codecell>

radviz(ang, '2010')

# <codecell>

pops = pd.read_csv('subsaharan_africa.csv', index_col=17, na_values=None)

# <codecell>

pops.columns

# <codecell>

from sklearn import cross_validation
pops = pops.dropna(axis=0)

# <codecell>
def v_radviz(sdf, features, target):
    """Draw a RadViz plot of the ``features`` columns coloured by ``target``.

    Parameters
    ----------
    sdf : pandas.DataFrame
        Source frame.
    features : iterable of str
        Columns to plot.
    target : str
        Class column. It is added to the selection when ``features`` does
        not already contain it, because ``radviz`` looks the class column up
        in the frame it is given.
    """
    selected = list(features)
    if target not in selected:
        selected.append(target)
    radviz(sdf[selected], target)
plt.savefig('databox.eps', format='eps', dpi=600)
plt.show()

# Box plots per class
setosa.boxplot()
plt.title('Iris-setosa boxplot')
plt.savefig('setobox.eps', format='eps', dpi=600)
plt.show()

versi.boxplot()
plt.title('Iris-versi boxplot')
plt.savefig('versibox.eps', format='eps', dpi=600)
plt.show()

verginica.boxplot()
plt.title('Iris-verginica boxplot')
plt.savefig('verginicabox.eps', format='eps', dpi=600)
plt.show()

# Multivariate plots coloured by class
andrews_curves(data, 'irisclass').legend(bbox_to_anchor=(0.4, 1))
plt.savefig('andrews_curve.eps', format='eps', dpi=600)

# New figure: otherwise RadViz is drawn on top of the Andrews curves and
# both end up in radviz.eps.
plt.figure()
radviz(data, 'irisclass').legend(bbox_to_anchor=(1.1, 1))
plt.savefig('radviz.eps', format='eps', dpi=600)
plt.show()

parallel_coordinates(data, 'irisclass').legend(bbox_to_anchor=(1, 1))
plt.savefig('paracoor.eps', format='eps', dpi=600)
plt.show()

# plot scatter, correlation
sns.set(style="ticks")
sns.pairplot(data, hue="irisclass")
plt.savefig('scatermatrix.eps', format='eps', dpi=600)
plt.show()
# Load the data. On Windows encoding='cp932' is required.
iris = pd.read_csv('.//iris2.csv', index_col=0, encoding='cp932')

# Show the first five rows
# (single-argument ``print(...)`` behaves the same on Python 2 and 3)
print(iris.head(n=5))

# Summary statistics
iris_ab = iris.describe()
print(iris_ab)

# Correlation matrix
iris_co = iris.corr()
print(iris_co)

# Scatter-plot matrix; ``pd.scatter_matrix`` is only present on older pandas
try:
    from pandas.plotting import scatter_matrix
except ImportError:  # pandas < 0.20
    scatter_matrix = pd.scatter_matrix
scatter_matrix(iris, color='green', diagonal='kde', figsize=(6, 6))

# radviz
plt.figure()
radviz(iris, u"種類")

# Box plot
iris.boxplot(by=u"種類")
plt.show()

iris_co.to_csv('iris_corr.csv')
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 12 09:48:33 2015

@author: dusty
"""

##---(Thu Feb 12 09:43:40 2015)--
# The plotting helpers moved to ``pandas.plotting``; fall back to the old
# location on pandas releases that predate the move.
try:
    from pandas.plotting import radviz
except ImportError:  # pandas < 0.20
    from pandas.tools.plotting import radviz
import matplotlib.pyplot as plt
import pandas as pd

plt.figure()

data = pd.read_csv(
    '/Users/Dusty/Documents/Machine Learning/Vixie/etongueData.csv')
#data = data.drop('Sample', 1)

# Keep the class column and the columns that are plotted
data = data[['Sample', 'SRS', 'GPS', 'STS', 'UMS', 'SPS', 'SWS', 'BRS']]

radviz(data, 'Sample')
plt.show()
sns.heatmap(
    train[columns].corr(), annot=True, cmap='cubehelix_r'
)  # draws a heatmap of the correlation matrix computed by train[columns].corr()
plt.show()

# <a id="6210"></a> <br>
# ### 6-2-10 radviz

# In[ ]:

# The plotting helpers moved to ``pandas.plotting``; fall back to the old
# location on pandas releases that predate the move.
try:
    from pandas.plotting import radviz
except ImportError:  # pandas < 0.20
    from pandas.tools.plotting import radviz
columns = [
    'SalePrice', 'OverallQual', 'TotalBsmtSF', 'GrLivArea', 'GarageArea',
    'FullBath', 'YearBuilt', 'YearRemodAdd'
]
radviz(train[columns], "OverallQual")

# <a id="6212"></a> <br>
# ### 6-2-12 Factorplot

# In[ ]:

sns.factorplot('OverallQual', 'SalePrice', hue='Functional', data=train)
plt.show()

# <a id="63"></a> <br>
# ## 6-3 Data Preprocessing
# **Data preprocessing** refers to the transformations applied to our data before feeding it to the algorithm.
#
# Data preprocessing is a technique used to convert raw data into a clean data set. In other words, whenever data is gathered from different sources it is collected in a raw format that is not suitable for analysis.
# There are plenty of steps in data preprocessing; we list just some of them:
# between each pair of features
#
# From the pairplot, we'll see that the Iris-setosa species is separated from the other
# two across all feature combinations
sb.pairplot(dataset.drop("Id", axis=1), hue="Species", size=3, diag_kind="kde")

# Now that we've covered seaborn, let's go back to some of the ones we can make with Pandas
# We can quickly make a boxplot with Pandas on each feature split out by species
dataset.drop("Id", axis=1).boxplot(by="Species", figsize=(12, 6))

# A final multivariate visualization technique pandas has is radviz
# Which puts each feature as a point on a 2D plane, and then simulates
# having each sample attached to those points through a spring weighted
# by the relative value for that feature
try:
    from pandas.plotting import radviz
except ImportError:  # pandas < 0.20
    from pandas.tools.plotting import radviz
radviz(dataset.drop("Id", axis=1), "Species")

dataset.info()  # checking if there is any inconsistency in the dataset
# as we see there are no null values in the dataset, so the data can be processed

# convert Species column from categorical to numerical values
from sklearn.preprocessing import LabelEncoder
numbers = LabelEncoder()
dataset['Species_encoded'] = numbers.fit_transform(dataset.Species)

# NOTE(review): bare expression; ``y_train`` is not assigned in the visible
# code - confirm it is defined earlier or remove it.
y_train

# split data
X = dataset.iloc[:, :-2]
y = dataset.iloc[:, 6]
# The plotting helpers moved to ``pandas.plotting``; fall back to the old
# location on pandas releases that predate the move.
try:
    from pandas.plotting import parallel_coordinates
    from pandas.plotting import andrews_curves
    from pandas.plotting import radviz
except ImportError:  # pandas < 0.20
    from pandas.tools.plotting import parallel_coordinates
    from pandas.tools.plotting import andrews_curves
    from pandas.tools.plotting import radviz

# parallel coordinates plot
plt.figure(figsize=(18, 6))
parallel_coordinates(sampleData, '_user_id')

# andrews curves
plt.figure(figsize=(18, 6))
andrews_curves(sampleData, '_user_id')

# radviz spring constant plot
plt.figure(figsize=(12, 10))
radviz(sampleData, "_user_id")

# As can be observed from the above plots, the data is very closely spaced without any apparent linear or non-linear boundaries
# Initial inference - A tree based approach might work better when compared to a kernel based boundary fitting approach

# plotting predictor variables, not considering feat8 and feat16 here as the majority of their values would be replaced missing values
features = ["feat11", "feat13", "feat5", "feat2", "feat1", "feat4", "feat7",
            "feat14", "feat10", "feat15", "feat21", "feat3", "feat18",
            "feat9", "feat17", "feat20", "feat6", "feat12", "feat19",
            "feat22"]
workingData[features].hist()
# There are a few features with outliers; however, I am not handling outliers as this is health data and anomalies might
# help classification accuracy
#============================================================================================================
data = data.dropna()
# NOTE(review): this mask turns zeros into NaN but does not drop the rows
# (it runs after ``dropna``) - confirm this is the intended clean-up.
data = data[data != 0]  # Get rid of incomplete values
data.origin.value_counts()
# 245 American cars, 79 Asian cars, 70 European cars
data['model year'] = data['model year'] - 70
# set years equal to number of years after 1970
data['horsepower'] = data['horsepower'].astype(int)

# The plotting helpers moved to ``pandas.plotting``; fall back to the old
# location on pandas releases that predate the move.
try:
    from pandas.plotting import radviz
except ImportError:  # pandas < 0.20
    from pandas.tools.plotting import radviz
radviz(data.drop(["name", "model year"], axis=1), "origin")
plt.show()
#American cars have much larger displacement than asian and european cars
#Asian and european cars tend to have higher acceleration and mpg compared to most
#american cars.

sns.violinplot(x='origin', y='mpg', data=data)
plt.show()
#American cars tend to have mpg about 15 mpg, european cars have about 25 mpg, and
#asian cars have about 32 mpg

sns.violinplot(x='origin', y='displacement', data=data)
plt.show()
#American engines are MUCH bigger than Asian and European engines, which are about the
new_word = word["stem"] + suffix if suffix == "NULL": new_word = word["stem"] occur = word["instances"][new_word][0] * 1.0 if occur > maximum: maximum = occur if occur < minimum: minimum = occur values[suffix].append(occur) for suffix in suffixes: values[suffix].append(maximum) for index in values: if(min(values[index]) == max(values[index]) and len(values[index]) != 1): replace(values[index]) data = pd.DataFrame(values) ax = radviz(data, "Name", color = colors) ax.legend_.remove() if test: print data.iloc[[1]] plt.show() print "done" plt.show() def normalize(array): maximum = max(array) return map(lambda x: x / maximum, array) def generate_points_on_circle(num): arr = [] for i in range(num): rad = i * 2 * math.pi / num
# Determine the shape of the data print("{} instances with {} features\n".format(*df_red.shape)) # Determine the frequency of each class print(df_red.groupby('quality').count()) %matplotlib inline # Create a scatter matrix of the dataframe features from pandas.tools.plotting import scatter_matrix scatter_matrix(df_red, alpha=0.2, figsize=(12, 12), diagonal='kde') plt.show() from pandas.tools.plotting import parallel_coordinates plt.figure(figsize=(12,12)) parallel_coordinates(df_red, 'quality') plt.show() from pandas.tools.plotting import radviz plt.figure(figsize=(12,12)) radviz(df_red, 'quality') plt.show() #next start the scikit learn portion. regression or #good for this data
def test_radviz_deprecated(self, iris):
    """Calling ``plotting.radviz`` with these keywords emits FutureWarning."""
    keywords = {'frame': iris, 'class_column': 'Name'}
    with tm.assert_produces_warning(FutureWarning):
        plotting.radviz(**keywords)
# Seaborn package scatter plot that labels each species' data with a different colour
# used for identifying bivariate relationships
sns.FacetGrid(iris, hue="Species", size=5).map(plt.scatter, "SepalLengthCm", "SepalWidthCm").add_legend()

# Seaborn package Box plot
# useful for exploring univariate relationships
sns.boxplot(x="Species", y="PetalLengthCm", data=iris)

# Seaborn Kdeplot - plots density of feature measurements
# useful for exploring univariate relationships
sns.FacetGrid(iris, hue="Species", size=6).map(sns.kdeplot, "PetalLengthCm").add_legend()

# Pandas Radviz - puts each feature on 2D plane, based on spring tension minimization algorithm
# useful for multivariate visualization
# (the helpers moved to ``pandas.plotting``; the old location is the fallback)
try:
    from pandas.plotting import radviz
except ImportError:  # pandas < 0.20
    from pandas.tools.plotting import radviz
radviz(iris.drop("Id", axis=1), "Species")

# Seaborn pairplot - plots all pairs of features
# useful for exploring bivariate relationships
sns.pairplot(iris.drop("Id", axis=1), hue="Species")
def _save_kde_boxplot(column, filename):
    """Save a box plot and a histogram/KDE of one ``df`` column to ``filename``.

    Draws a 2x1 figure from the module-level ``df``: the top panel is a box
    plot of ``column`` grouped by ``classe``; the bottom panel is a
    density-normalised histogram of ``column`` with its KDE curve overlaid.
    The figure is written to ``filename`` and then closed.
    """
    # ``plt.subplots(2, 1)`` yields a 1-D array of two Axes; unpack them so
    # each panel is addressed directly instead of by a 2-D index.
    fig, (box_ax, dist_ax) = plt.subplots(2, 1, figsize=(10, 16))
    values = pd.Series(df[column])

    df.boxplot(column=column, by="classe", ax=box_ax)
    # ``density=True`` replaces the ``normed=True`` flag that Matplotlib
    # removed from ``hist``.
    values.hist(color='g', ax=dist_ax, density=True)
    # Use the documented lower-case spelling of the plot kind.
    values.plot(kind="kde", ax=dist_ax, style='r-', label=" KDE", legend=True)

    fig.savefig(filename)
    plt.close(fig)


def plot_main():
    """Write the exploratory figures for the module-level ``df`` as PNG files.

    Files are written relative to the current working directory:

    * one ``kde_boxplot_*.png`` per statistic column (box plot by ``classe``
      above a histogram with KDE),
    * ``Andrew_curves_df2.png`` (Andrews curves above parallel coordinates),
    * ``matrix_corr_kde.png`` (scatter matrix with KDE on the diagonal),
    * ``RadViz_df.png`` (RadViz projection).
    """
    # (column in df, output file).  The output name for ``eccentricity`` is
    # kept exactly as it was spelled before.
    # NOTE(review): the save for 'moyenne' was commented out in the previous
    # version; it is enabled here so every column is handled the same way -
    # confirm that the extra file is wanted.
    kde_boxplot_targets = (
        ("moyenne", 'kde_boxplot_moyenne.png'),
        ("ecart-type", 'kde_boxplot_ecart-type.png'),
        ("mediane", 'kde_boxplot_mediane.png'),
        ("entropie", 'kde_boxplot_entropie.png'),
        ("uniformit", 'kde_boxplot_uniformit.png'),
        ("surface", 'kde_boxplot_surface.png'),
        ("eccentricity", 'kde_boxplot_eccentricitie.png'),
    )
    for column, filename in kde_boxplot_targets:
        _save_kde_boxplot(column, filename)

    # Andrews curves & parallel coordinates plotting
    fig, (curves_ax, parallel_ax) = plt.subplots(2, 1, figsize=(10, 17))
    andrews_curves(df, 'classe', ax=curves_ax)
    parallel_coordinates(df, 'classe', ax=parallel_ax)
    fig.savefig('Andrew_curves_df2.png')
    plt.close(fig)

    # Correlation (scatter) matrix
    fig, matrix_ax = plt.subplots(1, 1, figsize=(10, 10))
    scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde', ax=matrix_ax)
    fig.savefig('matrix_corr_kde.png')
    plt.close(fig)

    # RadViz: visualizing multi-variate data
    fig, radviz_ax = plt.subplots(1, 1, figsize=(10, 10))
    radviz(df, "classe", ax=radviz_ax)
    fig.savefig('RadViz_df.png')
    plt.close(fig)
size=2, diag_kind="kde") # In[ ]: train[['var15', 'var36', 'logvar38', 'TARGET']].boxplot(by="TARGET", figsize=(12, 6)) # In[ ]: # A final multivariate visualization technique pandas has is radviz # Which puts each feature as a point on a 2D plane, and then simulates # having each sample attached to those points through a spring weighted # by the relative value for that feature from pandas.tools.plotting import radviz radviz(train[['var15', 'var36', 'logvar38', 'TARGET']], "TARGET") # # now look at all 8 features together # In[ ]: features # In[ ]: radviz(train[features + ['TARGET']], "TARGET") # In[ ]: sns.pairplot(train[features + ['TARGET']], hue="TARGET",
plt.show()

# The plotting helpers live in ``pandas.plotting`` since pandas 0.20 and the
# old ``pandas.tools.plotting`` module was later removed.  Try the current
# location first and fall back so old installs keep working.
try:
    from pandas.plotting import andrews_curves, parallel_coordinates, radviz
except ImportError:
    from pandas.tools.plotting import (
        andrews_curves,
        parallel_coordinates,
        radviz,
    )

# Every plot below uses the frame without its "Id" column; drop it once.
iris_no_id = iris.drop("Id", axis=1)

# The diagonal elements in a pairplot show the histogram by default.
# We can update these elements to show other things, such as a kde.
# NOTE(review): newer seaborn releases call the ``size`` parameter ``height``;
# rename it when the installed seaborn stops accepting ``size``.
sns.pairplot(iris_no_id, hue="Species", size=3, diag_kind="kde")
plt.show()

# Now that we've covered seaborn, let's go back to some of the plots we can
# make with pandas.
# We can quickly make a boxplot with pandas on each feature split out by
# species.
iris_no_id.boxplot(by="Species", figsize=(12, 6))
plt.show()

# One more sophisticated technique pandas has available is called Andrews
# Curves. Andrews Curves use the attributes of samples as coefficients for a
# Fourier series and then plot these.
andrews_curves(iris_no_id, "Species")
plt.show()

# Another multivariate visualization technique pandas has is
# parallel_coordinates. Parallel coordinates plots each feature on a separate
# column and then draws lines connecting the features for each data sample.
parallel_coordinates(iris_no_id, "Species")
plt.show()

# A final multivariate visualization technique pandas has is radviz, which
# puts each feature as a point on a 2D plane, and then simulates having each
# sample attached to those points through a spring weighted by the relative
# value for that feature.
radviz(iris_no_id, "Species")
plt.show()
#####################################################################
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt

# The plotting helpers live in ``pandas.plotting`` since pandas 0.20 and the
# old ``pandas.tools.plotting`` module was later removed.  Try the current
# location first and fall back so old installs keep working.
try:
    from pandas.plotting import parallel_coordinates, scatter_matrix, radviz
except ImportError:
    from pandas.tools.plotting import (
        parallel_coordinates,
        scatter_matrix,
        radviz,
    )

# Build a frame of the iris measurements, one column per feature name.
iris = load_iris()
df = pd.DataFrame(iris.data)
df.columns = iris.feature_names

# Boxplot
df.plot(kind='box')
plt.show()

# Histogram
df.plot(kind='hist')
plt.show()

# Radviz
# The class column is added only here, so the two plots above see the
# measurements alone while every plot below also sees 'name'.
df['name'] = iris.target
# NOTE: despite the name, ``fig`` receives what the pandas helpers return
# (matplotlib Axes), not a Figure.
fig = radviz(df, 'name')
plt.show()

# Scatterplot Matrix
fig = scatter_matrix(df, alpha=0.2, diagonal='kde')
plt.show()

# Parallel Coordinates
fig = parallel_coordinates(df, 'name')
plt.show()
# Fit ``lda`` on the labelled features and project the same features with it.
new_features = lda.fit(features, labels).transform(features)
print(new_features)

# Machine Learning beginning

# The plotting helpers live in ``pandas.plotting`` since pandas 0.20 and the
# old ``pandas.tools.plotting`` module was later removed.  Try the current
# location first and fall back so old installs keep working.
try:
    from pandas.plotting import parallel_coordinates, radviz, scatter_matrix
except ImportError:
    from pandas.tools.plotting import (
        parallel_coordinates,
        radviz,
        scatter_matrix,
    )

# Create a scatter matrix of the dataframe features
scatter_matrix(listing.dataframe, alpha=0.2, figsize=(12, 12), diagonal='kde')
plt.show()

# Parallel coordinates, grouped by the 'accommodates' column
plt.figure(figsize=(12, 12))
parallel_coordinates(listing.dataframe, 'accommodates')
plt.show()

# RadViz, using the 'id' column as the class column
plt.figure(figsize=(12, 12))
radviz(listing.dataframe, 'id')
plt.show()

# Data Extraction
# NOTE(review): newer scikit-learn exposes Bunch as ``sklearn.utils.Bunch``;
# confirm this import path against the installed version.
from sklearn.datasets.base import Bunch

DATA_DIR = os.path.abspath(os.path.join(".", "..", "team-zero", "data", "listing"))

# Show the contents of the data directory, skipping hidden entries
for name in os.listdir(DATA_DIR):
    if name.startswith("."):
        continue
    # Function-call form prints the same text on Python 2 and also runs on
    # Python 3, matching the ``print(...)`` call at the top of this section.
    print("- {}".format(name))
def rad_viz(df, labels):
    """Draw a RadViz plot of ``df`` and save it as ``figures/rad_viz.png``.

    ``labels`` is passed to ``radviz`` as its second argument, and the colours
    come from ``sns.color_palette()``.  The current figure is then saved with
    a transparent background.  Nothing is returned.
    """
    palette = sns.color_palette()
    radviz(df, labels, color=palette)
    plt.savefig('figures/rad_viz.png', transparent=True)