예제 #1
0
def multidimensional_plots(df,
                           target_name,
                           maxevents=10000,
                           standardize=False):
    """Plot Andrews curves, parallel coordinates and RadViz for ``df``.

    The rows are shuffled, at most ``maxevents`` of them are kept, and the
    three plots are drawn on a 2x2 grid with ``target_name`` as the class
    column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain the column ``target_name``.
    target_name : str
        Name of the class column.
    maxevents : int, optional
        Upper bound on the number of rows that are drawn.
    standardize : bool, optional
        When true, every column except ``target_name`` is rescaled to zero
        mean and unit standard deviation before plotting.  The default
        (``False``) plots the raw values.

    Returns
    -------
    The figure created by ``plt.figure`` that holds the three plots.
    """
    if standardize:
        # Rescale the feature columns only; the class column is carried over
        # untouched so that non-numeric labels keep working.
        features = df.drop(target_name, axis=1)
        scaled = (features - features.mean()) / features.std()
        scaled[target_name] = df[target_name]
        df = scaled

    # Shuffle the rows and keep at most ``maxevents`` of them.
    df_random = df.reindex(np.random.permutation(df.index))[:maxevents]

    fig = plt.figure(figsize=(9, 9))

    # One subplot per technique; the fourth cell of the grid stays empty.
    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random,
                         target_name,
                         ax=current_axis,
                         colormap='gist_rainbow')

    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    return fig
예제 #2
0
def _radviz(frame, ax, ax_conf, class_column, color_values=None):
    """Draw a RadViz plot on ``ax`` and style the axes from ``ax_conf``.

    ``frame`` and ``class_column`` are forwarded to ``radviz`` together with
    ``color_values``.  Legend, axis visibility, tick colours, axis labels and
    spine colours are then taken from the attributes of ``ax_conf``.
    """
    radviz(frame, class_column, ax=ax, color=color_values)

    # ---- legend configuration
    if ax_conf.legend_show:
        edge_colour = ax_conf.legend_edge_color
        legend = ax.legend(prop={'size': ax_conf.legend_size},
                           loc=ax_conf.legend_loc,
                           fancybox=True)
        legend.get_frame().set_edgecolor(edge_colour)

    # ---- tick configuration: visibility and colours
    ax.get_xaxis().set_visible(ax_conf.x_axis_show)
    ax.get_yaxis().set_visible(ax_conf.y_axis_show)
    ax.tick_params(axis='x', colors=ax_conf.x_axis_color)
    ax.tick_params(axis='y', colors=ax_conf.y_axis_color)

    # ---- axis labels and their colours
    ax.set_xlabel(ax_conf.x_axis_label, labelpad=-1)
    ax.xaxis.label.set_color(ax_conf.x_color_label)

    ax.set_ylabel(ax_conf.y_axis_label, labelpad=-1)
    ax.yaxis.label.set_color(ax_conf.y_color_label)

    # ---- spine configuration
    spines = ax.spines
    spines['top'].set_color(ax_conf.color_top_spine)
    spines['bottom'].set_color(ax_conf.color_bottom_spine)
    spines['left'].set_color(ax_conf.color_left_spine)
    spines['right'].set_color(ax_conf.color_right_spine)
예제 #3
0
def multidimensional_plots(df, target_name, maxevents=10000):
    """Plot standardised data as Andrews curves, parallel coordinates, RadViz.

    Every column except ``target_name`` is rescaled to zero mean and unit
    standard deviation, the rows are shuffled, at most ``maxevents`` of them
    are kept, and the three plots are drawn on a 2x2 grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain the column ``target_name``.
    target_name : str
        Name of the class column; its values are left unscaled.
    maxevents : int, optional
        Upper bound on the number of rows that are drawn.

    Returns
    -------
    The figure created by ``plt.figure`` that holds the three plots.
    """
    # Standardise the features only.  Running the arithmetic over the class
    # column as well is wasted work for numeric labels and an error for
    # non-numeric ones on pandas versions that do not skip such columns.
    features = df.drop(target_name, axis=1)
    df_std = (features - features.mean()) / features.std()
    # put the unnormalized target back
    df_std[target_name] = df[target_name]

    # Shuffle the rows; the slice caps the row count so plotting stays fast
    # (slicing past the end is harmless, so no size check is needed).
    df_random = df_std.reindex(np.random.permutation(df_std.index))[:maxevents]

    fig = plt.figure(figsize=(9, 9))

    # One subplot per technique; the fourth cell of the grid stays empty.
    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random,
                         target_name,
                         ax=current_axis,
                         colormap='gist_rainbow')

    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    return fig
예제 #4
0
def _radviz(frame, ax, ax_conf, class_column, color_values=None):
    """Render ``frame`` as a RadViz plot on ``ax`` and apply ``ax_conf``.

    After ``radviz`` has drawn the data, the legend (optional), axis
    visibility, tick colours, axis labels and spine colours are set from the
    attributes of ``ax_conf``.
    """
    radviz(frame, class_column, ax=ax, color=color_values)

    # ---- legend configuration
    if ax_conf.legend_show:
        frame_colour = ax_conf.legend_edge_color
        legend_kwargs = {
            'prop': {'size': ax_conf.legend_size},
            'loc': ax_conf.legend_loc,
            'fancybox': True,
        }
        ax.legend(**legend_kwargs).get_frame().set_edgecolor(frame_colour)

    # ---- tick configuration: visibility and colours
    ax.get_xaxis().set_visible(ax_conf.x_axis_show)
    ax.get_yaxis().set_visible(ax_conf.y_axis_show)
    ax.tick_params(axis='x', colors=ax_conf.x_axis_color)
    ax.tick_params(axis='y', colors=ax_conf.y_axis_color)

    # ---- axis labels and their colours
    ax.set_xlabel(ax_conf.x_axis_label, labelpad=-1)
    ax.xaxis.label.set_color(ax_conf.x_color_label)

    ax.set_ylabel(ax_conf.y_axis_label, labelpad=-1)
    ax.yaxis.label.set_color(ax_conf.y_color_label)

    # ---- spine configuration
    top, bottom = ax.spines['top'], ax.spines['bottom']
    top.set_color(ax_conf.color_top_spine)
    bottom.set_color(ax_conf.color_bottom_spine)
    left, right = ax.spines['left'], ax.spines['right']
    left.set_color(ax_conf.color_left_spine)
    right.set_color(ax_conf.color_right_spine)
예제 #5
0
def kolka(wywalmn5=True, doddowyw=[]):
    """Save a RadViz plot for every retained column of the global ``dane``.

    Each retained column is used in turn as the class column, and the plot is
    written to ``radviz_<number of excluded columns>_<column>.png``.

    Parameters
    ----------
    wywalmn5 : bool, optional
        When true, columns of ``dane`` with fewer than five distinct values
        are excluded.
    doddowyw : iterable of str, optional
        Additional column names to exclude.  The argument is copied and never
        modified, so the shared list default is harmless.
    """
    try:
        from pandas.plotting import radviz
    except ImportError:  # older pandas only ships pandas.tools.plotting
        from pandas.tools.plotting import radviz

    # Columns with fewer than five distinct values.  The list stays empty
    # when the filter is switched off, so the exclusion list below can always
    # be built (previously this raised NameError for wywalmn5=False).
    mn5 = []
    if wywalmn5:
        mn5 = [col for col in dane.columns if len(dane[col].unique()) < 5]

    # The four fixed names are, per the original note, the non-integer
    # columns; further columns can be excluded through ``doddowyw``.
    exclude = (['godziny', 'oc_sem', 'oc_rok', 'uczelnia'] + mn5 +
               list(doddowyw))

    daneint = dane[dane.columns.difference(exclude)]

    plt.figure(figsize=(10, 10))
    for nazwa in daneint.columns:
        plt.title(nazwa)
        radviz(daneint, nazwa)
        # len() is an int and must be converted before string concatenation.
        plt.savefig("radviz_" + str(len(exclude)) + "_" + nazwa + ".png")
        plt.clf()
예제 #6
0
def test_radviz(pandas=False, outpath=None):
    """
    Runs the radviz visualizer on the dataset.

    Parameters
    ----------
    pandas : bool
        Run the pandas version of the function
    outpath : path or None
        Save the figure to disk rather than show (if None)
    """
    data = load_data('occupancy')  # Load the data
    features = ['temp', 'humid', 'light', 'co2', 'hratio']
    classes = ['unoccupied', 'occupied']

    if pandas:
        radviz(data[features + ['occupied']], 'occupied')
        if outpath:
            plt.savefig(outpath)
        else:
            plt.show()

    else:
        # The arrays are only needed by the visualizer branch.  ``.values``
        # replaces ``.as_matrix()``, which newer pandas releases removed.
        X = data[features].values
        y = data.occupied.values

        visualizer = RadViz(  # Instantiate the visualizer
            classes=classes, features=features)
        visualizer.fit(X, y)  # Fit the data to the visualizer
        visualizer.transform(X)  # Transform the data
        visualizer.poof(outpath=outpath)  # Draw/show/poof the data
예제 #7
0
def visualize(config):
    """Save scatter-matrix, parallel-coordinates and RadViz plots per dataset.

    For every entry of ``config['datasets']`` three PNG files are written,
    named ``<name>_scatter_matrix.png``, ``<name>_parallel_coordinates.png``
    and ``<name>_radviz.png``.  The latter two use ``'quality'`` as the class
    column.  Returns ``OK``.
    """
    big = (20, 20)

    # These plots are meant to help with designing a feature vector.
    for entry in config['datasets']:
        frame = entry['df']
        prefix = entry['name']

        scatter_matrix(frame, alpha=0.2, figsize=big, diagonal='kde')
        plt.savefig(prefix + '_scatter_matrix' + '.png')
        plt.close()

        plt.figure(figsize=big)
        parallel_coordinates(frame, 'quality')
        plt.savefig(prefix + '_parallel_coordinates' + '.png')
        plt.close()

        plt.figure(figsize=big)
        radviz(frame, 'quality')
        plt.savefig(prefix + '_radviz' + '.png')
        plt.close()

    return OK
예제 #8
0
def multidimensional_plots(df, target_name, maxevents=10000):
    """Plot standardised data as Andrews curves, parallel coordinates, RadViz.

    Every column except ``target_name`` is rescaled to zero mean and unit
    standard deviation, the rows are shuffled, at most ``maxevents`` of them
    are kept, and the three plots are drawn on a 2x2 grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain the column ``target_name``.
    target_name : str
        Name of the class column; its values are left unscaled.
    maxevents : int, optional
        Upper bound on the number of rows that are drawn.

    Returns
    -------
    The figure created by ``plt.figure`` that holds the three plots.
    """
    # Standardise the features only: the class column may hold non-numeric
    # labels, which the arithmetic below cannot handle on pandas versions
    # that do not silently skip such columns.
    feature_frame = df.drop(target_name, axis=1)
    df_std = (feature_frame - feature_frame.mean()) / feature_frame.std()
    # put the unnormalized target back
    df_std[target_name] = df[target_name]

    # Shuffle the rows and cap their number so plotting stays fast; slicing
    # past the end is harmless, so no explicit size check is needed.
    shuffled_index = np.random.permutation(df_std.index)
    df_random = df_std.reindex(shuffled_index)[:maxevents]

    fig = plt.figure(figsize=(9, 9))

    # One subplot per technique; the fourth cell of the grid stays empty.
    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random, target_name, ax=current_axis,
                         colormap='gist_rainbow')

    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    return fig
예제 #9
0
def plot_radviz(dataset):
    """
    Show a RadViz plot of ``dataset``.

    ``radviz`` needs a data frame plus the name of the column holding the
    class membership, so the dataset's labelled frame and the name of its
    labels are passed in.
    """
    labelled_frame = dataset.get_labelled_data_frame()
    class_column = dataset.get_labels().name
    radviz(labelled_frame, class_column)
    plt.show()
예제 #10
0
def plot_radviz(dataset):
    """
    Display the provided DataSet as a RadViz plot.

    The merged data-and-labels frame of the dataset is handed to ``radviz``
    with the name of the labels as the class column.
    """
    frame = dataset.get_labelled_data_frame()
    label_name = dataset.get_labels().name
    radviz(frame, label_name)
    plt.show()
def Evt_Multi_D_Radviz_Plot(self, event):
    """Redraw the selected tab with a RadViz plot of the checked variables.

    The second element of every entry in ``self.selected_checkboxes`` is used
    as a column name, ``"customer_number"`` is added as the class column, and
    rows ``self.minimum:self.maximum`` of ``self.data`` are plotted.
    """
    panel = self.New_Tab.GetPage(self.New_Tab.GetSelection())
    self.selected_checkbox()
    panel.canvas.figure.clf()

    columns = [variable[1] for variable in self.selected_checkboxes]
    columns.append("customer_number")

    subset = self.data[columns][self.minimum: self.maximum]
    radviz(subset, "customer_number")
    panel.canvas.draw()
예제 #12
0
def Evt_Multi_D_Radviz_Plot(self, event):
    """Event handler: plot the checked variables of the current tab as RadViz.

    Column names come from index 1 of each ``self.selected_checkboxes``
    entry; ``"customer_number"`` serves as the class column and only rows
    ``self.minimum:self.maximum`` of ``self.data`` are drawn.
    """
    selected_page = self.New_Tab.GetSelection()
    panel = self.New_Tab.GetPage(selected_page)
    self.selected_checkbox()
    panel.canvas.figure.clf()

    wanted = [checkbox[1] for checkbox in self.selected_checkboxes]
    wanted += ["customer_number"]

    window = self.data[wanted][self.minimum:self.maximum]
    radviz(window, "customer_number")
    panel.canvas.draw()
예제 #13
0
    def plot_radviz_comparison(self, category_column, 
                               columns=[], rows=[], filters={}, point_size=30):
        """return plot axis of radviz graph
        
        RadViz is a way of visualizing multi-variate data. 
        It is based on a simple spring tension minimization algorithm. 
        Basically you set up a bunch of points in a plane. In our case they are 
        equally spaced on a unit circle. Each point represents a single attribute. 
        You then pretend that each sample in the data set is attached to each 
        of these points by a spring, the stiffness of which is proportional to 
        the numerical value of that attribute (they are normalized to unit 
        interval). The point in the plane, where our sample settles to (where 
        the forces acting on our sample are at an equilibrium) is where a dot 
        representing our sample will be drawn. Depending on which class that 
        sample belongs it will be colored differently.

        Parameters
        ----------
        category_column : str
            Name of the class column; must be a column of ``self._df`` other
            than ``'Molecule'``, otherwise ``ValueError`` is raised.
        columns, rows, filters
            Selection arguments forwarded to ``self.get_table``.  A non-empty
            ``columns`` is extended with the category column when it is
            missing; the caller's list itself is never modified.
        point_size : int, optional
            Passed to ``radviz`` as the marker size ``s``.
        """
        col_names = self._df.drop('Molecule', axis=1).columns.tolist()
        if category_column not in col_names:
            raise ValueError('{0} not in columns'.format(category_column))

        if columns:
            # An all-integer selection addresses columns by position, so the
            # category column has to be looked for by its index as well.
            if all(isinstance(item, int) for item in columns):
                required = col_names.index(category_column)
            else:
                required = category_column
            if required not in columns:
                # Build a new list instead of appending in place, so neither
                # the caller's list nor the shared default is changed.
                columns = list(columns) + [required]

        df = self.get_table(rows, columns, filters)

        ax = radviz(df, category_column, s=point_size)
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        return ax
예제 #14
0
    def test_radviz(self):
        """Exercise ``radviz`` with default, explicit and colormap colours."""
        from pandas.tools.plotting import radviz
        from matplotlib import cm

        # Smoke test with the default colours on the iris fixture.
        df = self.iris
        _check_plot_works(radviz, df, 'Name')

        # Explicit hex colours: keep the axes so its patches can be inspected.
        rgba = ('#556270', '#4ECDC4', '#C7F464')
        ax = _check_plot_works(radviz, df, 'Name', color=rgba)
        # skip Circle drawn as ticks
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches[:10],
                           facecolors=rgba,
                           mapping=df['Name'][:10])

        # Named colours.
        # NOTE(review): the result of this call is not assigned, so ``ax``
        # below is still the axes from the hex-colour call - confirm that
        # checking the earlier axes is intended.
        cnames = ['dodgerblue', 'aquamarine', 'seagreen']
        _check_plot_works(radviz, df, 'Name', color=cnames)
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10])

        # Colormap: one expected colour per distinct class, sampled evenly
        # from ``cm.jet``.  The same NOTE(review) about ``ax`` applies here.
        _check_plot_works(radviz, df, 'Name', colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10])

        # Explicit RGBA lists on a three-row frame: the legend handles are
        # checked against the given colours.
        colors = [[0., 0., 1., 1.], [0., 0.5, 1., 1.], [1., 0., 0., 1.]]
        df = DataFrame({
            "A": [1, 2, 3],
            "B": [2, 1, 3],
            "C": [3, 2, 1],
            "Name": ['b', 'g', 'r']
        })
        ax = radviz(df, 'Name', color=colors)
        handles, labels = ax.get_legend_handles_labels()
        self._check_colors(handles, facecolors=colors)
예제 #15
0
    def test_radviz(self):
        """Check ``radviz`` colour handling on the iris fixture."""
        from pandas.tools.plotting import radviz
        from matplotlib import cm

        # Default colours: only verifies that plotting works.
        df = self.iris
        _check_plot_works(radviz, df, 'Name')

        # Hex colours passed through ``color``; the returned axes is kept for
        # the patch inspection below.
        rgba = ('#556270', '#4ECDC4', '#C7F464')
        ax = _check_plot_works(radviz, df, 'Name', color=rgba)
        # skip Circle drawn as ticks
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches[:10], facecolors=rgba, mapping=df['Name'][:10])

        # Named colours.
        # NOTE(review): ``ax`` is not reassigned here, so the patches come
        # from the hex-colour axes above - confirm whether that is intended.
        cnames = ['dodgerblue', 'aquamarine', 'seagreen']
        _check_plot_works(radviz, df, 'Name', color=cnames)
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10])

        # Colormap: expected colours are ``cm.jet`` sampled evenly, one per
        # distinct class.  ``ax`` is again the earlier axes (see note above).
        _check_plot_works(radviz, df, 'Name', colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10])

        # RGBA lists on a three-row frame; the legend handles must carry the
        # colours that were passed in.
        colors = [[0., 0., 1., 1.],
                  [0., 0.5, 1., 1.],
                  [1., 0., 0., 1.]]
        df = DataFrame({"A": [1, 2, 3],
                        "B": [2, 1, 3],
                        "C": [3, 2, 1],
                        "Name": ['b', 'g', 'r']})
        ax = radviz(df, 'Name', color=colors)
        handles, labels = ax.get_legend_handles_labels()
        self._check_colors(handles, facecolors=colors)
예제 #16
0
    def Clonal_Evolution_Multidimensional_Data(self):
        """Save three multivariate EPS plots of the frames in ``DataStructs``.

        Every frame gets a ``'t'`` column holding its position in
        ``DataStructs`` (0.0, 1.0, ...), the frames are stacked, and the
        ``t``, ``Size``, ``PR`` and ``MR`` columns are divided by their
        maxima.  Parallel-coordinates, Andrews-curves and RadViz plots with
        ``'ID'`` as the class column are then written to disk.
        """
        combined = pd.DataFrame()
        for step, snapshot in enumerate(DataStructs):
            time_point = float(step)
            # The time stamp is written onto the frame itself.
            snapshot['t'] = pd.Series([time_point] * len(snapshot),
                                      index=snapshot.index)
            if step == 0:
                combined = snapshot
            else:
                combined = pd.concat([combined, snapshot], ignore_index=True)

        ids = combined['ID']
        sizes = combined['Size']
        mr = combined['MR']
        pr = combined['PR']
        times = combined['t']

        rows = zip(times / max(times), sizes / max(sizes), pr / max(pr),
                   mr / max(mr), ids)
        normalised = pd.DataFrame(rows,
                                  columns=['t', 'Size', 'PR', 'MR', 'ID'])

        def save_with_side_legend(filename):
            # Put the legend to the right of the axes, then write the EPS.
            plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
            plt.savefig(filename, format='eps', dpi=1000)

        plt.figure()
        pc_axes = parallel_coordinates(normalised, 'ID', colormap='jet')
        pc_axes.set_title("PC Plot")
        save_with_side_legend('Clonal_Evolution_Parallel_Coords_Plot.eps')

        plt.figure()
        andrews_curves(normalised, 'ID', colormap='jet')
        save_with_side_legend('Clonal_Evolution_Andrews_Curves_Plot.eps')

        plt.figure()
        radviz(normalised, 'ID', colormap='jet')
        save_with_side_legend('Clonal_Evolution_RadViz_Plot.eps')
예제 #17
0
파일: kmeans_np.py 프로젝트: lulugyf/pycode
def t4(tp='r'):
    """Show one multivariate plot of the CSV at ``file:///e:/stock/xx1``.

    Parameters
    ----------
    tp : str, optional
        ``'r'`` draws RadViz, ``'a'`` Andrews curves and ``'p'`` parallel
        coordinates, each with the ``Name`` column as the class.  Any other
        value leaves the figure empty.
    """
    # Multidimensional data visualisation with pandas
    # (conda install pandas); background reading:
    # http://cloga.info/%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90/2016/10/12/multivariate-data-visualization
    import pandas as pd
    import matplotlib.pyplot as plt
    try:
        from pandas.plotting import andrews_curves
        from pandas.plotting import parallel_coordinates
        from pandas.plotting import radviz
    except ImportError:  # older pandas only ships pandas.tools.plotting
        from pandas.tools.plotting import andrews_curves
        from pandas.tools.plotting import parallel_coordinates
        from pandas.tools.plotting import radviz

    data = pd.read_csv('file:///e:/stock/xx1')

    plt.figure()
    if tp == 'r':
        radviz(data, 'Name')
    elif tp == 'a':
        andrews_curves(data, 'Name')
    elif tp == 'p':
        parallel_coordinates(data, 'Name')
    plt.show()
def radvizPlot(base, classe):
    """Show a RadViz plot of ``base`` with ``classe`` as the class column.

    The plot is drawn on a new 10x8 figure, gets a small two-column legend
    and has its y-range set to [-2, 2].
    """
    plt.figure(figsize=(10, 8))

    legend_options = dict(loc='center left',
                          bbox_to_anchor=(0, 1),
                          fancybox=True,
                          ncol=2,
                          fontsize='x-small')
    radviz(base, classe).legend(**legend_options)

    plt.ylim([-2, 2])
    plt.show()
예제 #19
0
    def radial_plot(self,
                    data=None,
                    labels=None,
                    x_label=None,
                    y_label=None,
                    title=None):
        '''Show a pandas RadViz plot of ``data``.

        ``labels`` is passed to ``radviz`` as the class column and the
        colours come from ``sns.color_palette()``.  ``title``, ``x_label``
        and ``y_label`` are applied to the axes when they are given; leaving
        them as ``None`` keeps the axes as ``radviz`` produced them.
        '''
        ax = radviz(data, labels, color=sns.color_palette())

        if title is not None:
            ax.set_title(title)
        if x_label is not None:
            ax.set_xlabel(x_label)
        if y_label is not None:
            ax.set_ylabel(y_label)

        plt.show()
예제 #20
0
 def createRadViz(self,data,base_dir,fileName):
     """Write one RadViz page per low-cardinality column of ``data`` to a PDF.

     A column qualifies as class column when it has between 2 and 20
     distinct counted values.  For each such column the numeric columns of
     ``data`` plus that column are plotted, and the page is appended to the
     PDF at ``base_dir + fileName``.
     """
     try:
         from pandas.plotting import radviz
     except ImportError:  # older pandas only ships pandas.tools.plotting
         from pandas.tools.plotting import radviz

     # The context manager closes the PDF even when plotting fails.
     with PdfPages(''.join([base_dir,fileName])) as pdf:
         for cols in data.columns.values:
             n_levels = len(data[cols].value_counts())
             if 1 < n_levels <= 20:
                 # Copy so that adding the class column never writes into
                 # data shared with ``data``.
                 req_data = data._get_numeric_data().copy()
                 req_data[cols] = data[cols]
                 plt.figure()
                 ax = radviz(req_data, cols)
                 ax.set_title(''.join(["plot of radviz vis ", cols]))
                 page = ax.get_figure()
                 pdf.savefig(page)
                 # Release the figure; otherwise one stays open per column.
                 plt.close(page)
예제 #21
0
def visualize(config):
    """Write three exploratory PNG plots for each configured dataset.

    Each entry of ``config['datasets']`` provides a frame under ``'df'`` and
    a file-name prefix under ``'name'``.  A scatter matrix, a
    parallel-coordinates plot and a RadViz plot (the latter two with
    ``'quality'`` as the class column) are saved, then ``OK`` is returned.
    """
    def save_and_close(prefix, kind):
        # File names follow ``<prefix>_<kind>.png``.
        fig_name = prefix + kind + '.png'
        plt.savefig(fig_name)
        plt.close()

    # The plots are meant to help with designing a feature vector.
    for dataset in config['datasets']:
        scatter_matrix(dataset['df'], alpha=0.2, figsize=(20, 20),
                       diagonal='kde')
        save_and_close(dataset['name'], '_scatter_matrix')

        plt.figure(figsize=(20, 20))
        parallel_coordinates(dataset['df'], 'quality')
        save_and_close(dataset['name'], '_parallel_coordinates')

        plt.figure(figsize=(20, 20))
        radviz(dataset['df'], 'quality')
        save_and_close(dataset['name'], '_radviz')

    return OK
예제 #22
0
def multidimensional_plots(df, target_name, maxevents=10000, standardize=False):
    """Draw Andrews curves, parallel coordinates and RadViz plots of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain the column ``target_name``.
    target_name : str
        Name of the class column.
    maxevents : int, optional
        Upper bound on the number of (shuffled) rows that are drawn.
    standardize : bool, optional
        When true, every column except ``target_name`` is rescaled to zero
        mean and unit standard deviation first.  The default (``False``)
        plots the raw values.

    Returns
    -------
    The figure created by ``plt.figure``; the plots occupy three cells of a
    2x2 grid.
    """
    if standardize:
        # Only the features are rescaled; the class column is copied over
        # unchanged so that non-numeric labels keep working.
        feature_frame = df.drop(target_name, axis=1)
        rescaled = (feature_frame - feature_frame.mean()) / feature_frame.std()
        rescaled[target_name] = df[target_name]
        df = rescaled

    # randomize the data frame order and cap the number of rows
    df_random = df.reindex(np.random.permutation(df.index))[:maxevents]

    fig = plt.figure(figsize=(9, 9))

    # One subplot per technique; the fourth cell of the grid stays empty.
    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random, target_name, ax=current_axis,
                         colormap='gist_rainbow')

    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    return fig
예제 #23
0
def plot_radial(data_frame, class_name):
    """Save a RadViz plot of ``data_frame`` titled 'Radial Plot'.

    The current figure is cleared first; the image is written to
    ``Status.RADIAL_NAME`` inside ``Status.TEMP_DIR``.
    """
    plt.clf()
    radviz(data_frame, class_name)
    plt.title('Radial Plot')
    target_path = join(Status.TEMP_DIR, Status.RADIAL_NAME)
    plt.savefig(target_path)
예제 #24
0
import pandas as pd
#import warnings
#warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.tools.plotting import parallel_coordinates
from pandas.tools.plotting import radviz

sns.set(style="white", color_codes=True)

# The frame keeps its historical name ``iris`` but holds the wine data set.
iris = pd.read_csv("wine.csv")
#iris.head()
#print iris
#print iris["Wine"].value_counts()
#sns.jointplot(x="A1", y="A2", data=iris, kind='reg', size=6)

#sns.pairplot(iris, hue="Wine",size=1)

# Each plot gets its own figure; without this, both would be drawn on top of
# each other on the same axes.
plt.figure()
parallel_coordinates(iris, "Wine")

plt.figure()
radviz(iris, "Wine")

# Use pyplot directly: newer seaborn releases no longer expose ``sns.plt``.
plt.show()
예제 #25
0
 def test_radviz_deprecated(self):
     """``plotting.radviz`` must emit a ``FutureWarning`` when called."""
     iris_frame = self.iris
     with tm.assert_produces_warning(FutureWarning):
         plotting.radviz(frame=iris_frame, class_column='Name')
예제 #26
0
# Parallel-coordinates plot of ``df`` with "Survived" as the class column.
parallel_coordinates(df, "Survived")


# 8. A final multivariate visualization technique pandas has is radviz.
#    In RadViz, each dimension in the dataset is represented by a
#    dimensional anchor, and the anchors are distributed evenly on a unit
#    circle. Each row of the data set corresponds to a point in the
#    projection that is linked to every dimensional anchor by a spring.
#    Each spring's stiffness corresponds to the row's value in that
#    dimension. The point is placed where the springs' tension is minimal.

# 8.1 First with random data
radviz(df1, "a")   # There is no pulling anywhere
# 8.2 Then with titanic data
radviz(df, "Survived")


# 9. Next is t-SNE
tsne= TSNE()
# 9.1 First t-SNE of random data: all columns but the first are embedded.
#     NOTE(review): the second argument is every column of df1
#     (``iloc[:, 0:]``), whereas 9.2 passes only the first column - confirm
#     which one is intended.
tsne_results_random = tsne.fit_transform(df1.iloc[: , 1:], df1.iloc[: , 0:])
# 9.2 Next, t-SNE of titanic data
tsne_results_titanic = tsne.fit_transform(df.iloc[:, 1:], df.iloc[:, 0])

# 10. Plot the two results
# 10.1 First a deep copy of the random data without its first column
df1_tsne = df1.iloc[:, 1:].copy()
# 10.2 Here are the X-axis points
예제 #27
0
# create a plot for each metric
for metric in stats:
    print('Plotting metric: %s' % metric)
    # ``sort_values`` replaces ``DataFrame.sort``, which newer pandas removed.
    df = df.sort_values(metric, ascending=True)
    fig = plt.figure(figsize=(18, 18))

    axes = plt.Axes(fig, [.2, .1, .7, .8])  # [left, bottom, width, height]
    fig.add_axes(axes)

    df[metric].plot(kind='barh', title=metric, alpha=0.7)
    plt.savefig('images/' + metric.replace(' ', '-'))
    # Release the figure; otherwise one large figure stays open per metric.
    plt.close(fig)


# create csv
df.to_csv('data/todomvc-metrics.csv')

# create radviz
try:
    from pandas.plotting import radviz
except ImportError:  # older pandas only ships pandas.tools.plotting
    from pandas.tools.plotting import radviz

# Copy the selection so that adding the class column below writes to an
# independent frame rather than to a slice of ``df``.
df_rad = df[['Sum Logical SLOC',
            'Mean Cyclomatic Complexity',
            'Sum Halstead Time',
            'Mean Maintainability Index']].copy()
# Every row is its own class: the index values serve as class labels.
df_rad['Name'] = df_rad.index.tolist()

fig = plt.figure(figsize=(18, 18))
ax = radviz(df_rad, 'Name')
legend = ax.legend(fontsize='xx-small', fancybox=True, ncol=3)
plt.setp(legend.get_title(), fontsize='xx-small')
plt.savefig('images/radviz')
예제 #28
0
from pandas.tools.plotting import radviz

# Load the iris data and draw it as a RadViz plot on a fresh figure, using
# the 'Name' column as the class.
data = pd.read_csv('data/iris.data')
plt.figure()
radviz(data, 'Name')
예제 #29
0
    df.to_csv(OUTPATH, sep='\t', index=False, header=False)

    print "Wrote dataset of %i instances and %i attributes to %s" % (df.shape + (OUTPATH,))

    with open('meta.json', 'w') as f:
        meta = {'feature_names': FEATURES, 'target_names': LABEL_MAP}
        json.dump(meta, f, indent=4)

    # Describe the dataset
    print df.describe()

    # Determine the shape of the data
    print "{} instances with {} features\n".format(*df.shape)

    # Determine the frequency of each class
    print df.groupby('label')['label'].count()

    # Create a scatter matrix of the dataframe features
    scatter_matrix(df, alpha=0.2, figsize=(12, 12), diagonal='kde')
    plt.show()

    # Parallel Coordinates
    plt.figure(figsize=(12,12))
    parallel_coordinates(df,'label')
    plt.show()

    # Radviz
    plt.figure(figsize=(12,12))
    radviz(df, 'label')
    plt.show()
예제 #30
0
# Finish the plot started above: label the z axis and use the formatted
# ``finalDate`` values selected by ``x`` as x tick positions.
ax.set_zlabel("Z Label")
ax.set_xticks(df.finalDate[x].apply(lambda d: d.strftime("%Y-%m-%d")).values)
plt.show()


#################Several plots in 2D
# A quick tour of the pandas plotting helpers on the same frame.
df.plot(subplots=True)
df.plot(x="finalDate", y="finalBalance")
df.hist(by=["windowSize", "trainingSize"])
df.boxplot("finalBalance", by=["windowSize", "trainingSize"])
scatter_matrix(df, alpha=0.2, diagonal="kde")
df.plot(x="finalDate", y="finalBalance", kind="kde")

parallel_coordinates(df, "windowSize")
autocorrelation_plot(df.finalBalance)
# NOTE(review): "finalBalance" is used as the class column here - confirm
# that it has few enough distinct values for the colouring to be useful.
radviz(df, "finalBalance")
df.plot(colormap="jet")

#################More specific plots in 2D
# 2x3 grid of axes: ``ax1`` is the top row, ``ax2`` the bottom row.
f, (ax1, ax2) = plt.subplots(2, 3)
# Column 0: mean (top) and sum (bottom) of finalBalance per windowSize.
ax1[0].plot(df.groupby(["windowSize"]).mean()["finalBalance"])
ax1[0].set_title("Window Size Mean")
ax1[0].set_ylim((5000, 15000))

ax2[0].plot(df.groupby(["windowSize"]).sum()["finalBalance"])
ax2[0].set_title("Window Size Sum")

# Column 1: mean of finalBalance per trainingSize.
ax1[1].plot(df.groupby(["trainingSize"]).mean()["finalBalance"])
ax1[1].set_title("Training Size Mean")
ax1[1].set_ylim((5000, 15000))
def format_data():
    """Load, clean and plot the 2013 health-professional fee data.

    The sheet stored under key 2 of the workbook is cleaned ('nc' rows and
    rows with missing values dropped, only rows whose ``DEPARTEMENT`` and
    ``SPECIALISTES`` contain ``'- '`` and whose ``EFFECTIFS`` is positive
    kept).  The number and name parts of both labels are split into
    separate columns and two per-practitioner ratios are added.  A
    histogram, a scatter matrix, a correlation heatmap and a RadViz plot are
    drawn along the way, and the head of the frame is printed twice.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the added columns.
    """
    path = "data/Honoraires_totaux_des_professionnels_de_sante_par_departement_en_2013.xls"

    # NOTE(review): ``sheetname`` is the keyword spelling of older pandas;
    # newer releases call it ``sheet_name`` - confirm against the pandas
    # version this project pins.
    data = pd.read_excel(path, sheetname=[1, 2, 3, 4, 5, 6])

    df = data[2]
    df = df.replace('nc', np.nan).dropna()
    df = df[df['DEPARTEMENT'].str.contains('- ')]
    df = df[df['SPECIALISTES'].str.contains('- ')]
    df = df[df['EFFECTIFS'] > 0]

    # The helper frames reuse the index of the filtered ``df``.  With a
    # fresh 0..n-1 index the index-based join below would pair each row with
    # the split of a different row and drop rows whose label is not in that
    # range.  Splitting once keeps exactly two parts even if the name itself
    # contains '- '.
    dep = pd.DataFrame([[x] + x.split('- ', 1) for x in df['DEPARTEMENT']],
                       columns=['DEPARTEMENT', 'num_dep', 'name_dep'],
                       index=df.index)

    spec = pd.DataFrame([[x] + x.split('- ', 1) for x in df['SPECIALISTES']],
                        columns=['SPECIALISTES', 'num_spec', 'name_spec'],
                        index=df.index)

    df = pd.concat([df, dep, spec], axis=1, join='inner')

    # Department codes ending in A/B are mapped to fractional numbers.
    df['num_dep'] = df['num_dep'].str.replace('^0', '').str.replace(
        'B', '.5').str.replace('A', '.25').astype('float')

    df['num_spec'] = df['num_spec'].replace('^0', '').astype('float')
    df['DEPASSEMENTS (euros)'] = df['DEPASSEMENTS (euros)'].astype('float')

    print(df.head())

    df['NOMBRE DE DEPASSEMENTS (/medecin)'] = df['NOMBRE DE DEPASSEMENTS'] / df['EFFECTIFS']
    df['DEPASSEMENTS (euros/medecin)'] = df['DEPASSEMENTS (euros)'] / df['EFFECTIFS']

    # ------------ histogram of 'DEPASSEMENTS (euros)' ------------------------
    plt.figure()
    plt.title('on considere les DEPASSEMENTS totaux > 5e6€ comme outliers (on le retire)')
    df['DEPASSEMENTS (euros)'].plot(kind='hist', stacked=True, bins=100)
    plt.xlim(0, 6e6)

    # -------------- scatter matrix --------------------------------------------
    # The same column selection feeds the scatter matrix, the correlation
    # heatmap and the RadViz plot.
    dat = df[['num_spec', 'NOMBRE DE DEPASSEMENTS',
              'DEPASSEMENT MOYEN (euros)', 'DEPASSEMENTS (euros)', 'EFFECTIFS']]
    scatter_matrix(dat, diagonal='kde')

    # ---------- correlations between the variables ---------------------------
    corr_mat = dat.corr()

    plt.figure()
    sns.heatmap(corr_mat, square=True)

    # ---------- RadViz with the speciality number as class column ------------
    plt.figure()
    plt.title('Sans prendre en compte le departement.\n Un EFFECTIF plus faible semble corrélé  à des\n' +
              ' NOMBRE DE DEPASSEMENTS et DEPASSEMENTS (totaux) plus elevés.\n' + 'Il semble également que' +
              " les EFFECTIFS soient mal repartis d'une specialite à l'autre.")
    radviz(dat, 'num_spec')

    print(df.head())

    return df
예제 #32
0
파일: vizDr.py 프로젝트: dot2dotseurat/viz
def rad_viz(df,labels):
    """Show a RadViz plot of ``df`` coloured with the seaborn palette.

    ``labels`` is passed to ``radviz`` as the class column.
    """
    palette = sns.color_palette()
    radviz(df, labels, color=palette)
    plt.show()
df = pd.DataFrame(stats)

# create a plot for each metric
for metric in stats:
    # ``sort_values`` replaces ``DataFrame.sort``, which newer pandas removed.
    df = df.sort_values(metric, ascending=True)
    fig = plt.figure()

    axes = plt.Axes(fig, [.2, .1, .7, .8])  # [left, bottom, width, height]
    fig.add_axes(axes)

    df[metric].plot(kind='barh', title=metric, alpha=0.7)
    plt.savefig('images/' + metric.replace(' ', '-'))
    # Release the figure; otherwise one figure stays open per metric.
    plt.close(fig)


# create csv
df.to_csv('data/todomvc-metrics.csv')

# create radviz
try:
    from pandas.plotting import radviz
except ImportError:  # older pandas only ships pandas.tools.plotting
    from pandas.tools.plotting import radviz

# Copy the selection so that adding the class column below writes to an
# independent frame rather than to a slice of ``df``.
df_rad = df[
    ['Sum Logical SLOC',
    'Mean Cyclomatic Complexity',
    'Sum Halstead Time',
    'Mean Maintainability Index']].copy()
# Every row is its own class: the index values serve as class labels.
df_rad['Name'] = df_rad.index.tolist()

fig = plt.figure()
radviz(df_rad, 'Name')
plt.savefig('images/radviz')
예제 #34
0
# Bare expression: in a notebook cell this displays ``life``.
life

# <codecell>

from pandas.tools.plotting import radviz

# <codecell>

#rn = range(0,56)
#for n in rn:
 #   print df.values[-n]
# Open a new figure for the RadViz plot drawn in the next cell.
plt.figure()

# <codecell>

# RadViz of ``ang`` with the '2010' column as the class column.
radviz(ang, '2010')

# <codecell>

# Load the population table, using the column at position 17 as the index.
pops = pd.read_csv('subsaharan_africa.csv', index_col=17,na_values=None)

# <codecell>

pops.columns

# <codecell>

from sklearn import cross_validation
# Drop every row that contains a missing value.
pops = pops.dropna(axis=0)

# <codecell>
예제 #35
0
def v_radviz(sdf, features, target):
    """Draw a RadViz plot of the ``features`` columns of ``sdf``.

    ``target`` is passed to ``radviz`` as the class column.
    """
    selected = sdf[features]
    radviz(selected, target)
예제 #36
0
# Save and show the box plot drawn above, then one box plot per class frame.
plt.savefig('databox.eps', format='eps', dpi=600)
plt.show()
setosa.boxplot()
plt.title('Iris-setosa boxplot')
plt.savefig('setobox.eps', format='eps', dpi=600)
plt.show()
versi.boxplot()
plt.title('Iris-versi boxplot')
plt.savefig('versibox.eps', format='eps', dpi=600)
plt.show()
verginica.boxplot()
plt.title('Iris-verginica boxplot')
plt.savefig('verginicabox.eps', format='eps', dpi=600)
plt.show()

# Multivariate plots with 'irisclass' as the class column; each legend is
# repositioned through bbox_to_anchor.
# NOTE(review): no new figure is opened and nothing is shown between the
# Andrews-curves and RadViz plots, so the RadViz plot may be drawn onto the
# same axes - confirm.
andrews_curves(data, 'irisclass').legend(bbox_to_anchor=(0.4, 1))
plt.savefig('andrews_curve.eps', format='eps', dpi=600)
radviz(data, 'irisclass').legend(bbox_to_anchor=(1.1, 1))
plt.savefig('radviz.eps', format='eps', dpi=600)
plt.show()

parallel_coordinates(data, 'irisclass').legend(bbox_to_anchor=(1, 1))
plt.savefig('paracoor.eps', format='eps', dpi=600)
plt.show()

# Pairwise scatter plots of the features, coloured by class.
sns.set(style="ticks")
sns.pairplot(data, hue="irisclass")
plt.savefig('scatermatrix.eps', format='eps', dpi=600)
plt.show()
예제 #37
0
# The bare string literals below are the original section markers; they are
# expression statements and are kept unchanged.  English summaries are given
# as comments.

# Load the data; encoding='cp932' is required on Windows.
'''データの読み込み。Windows環境はencoding='cp932'必須'''
iris = pd.read_csv('.//iris2.csv',index_col=0,encoding='cp932')

# Show the first five rows.
'''最初の5つを表示'''
print iris.head(n=5)

# Summary statistics.
'''概要'''

iris_ab=iris.describe()
print iris_ab

# Correlation matrix.
'''相関係数行列'''
iris_co=iris.corr()
print iris_co


# Scatter matrix.
'''散布図行列'''
pd.scatter_matrix(iris,color='green',diagonal='kde',figsize=(6,6))

# RadViz on a new figure, using the class column as the class.
'''radviz'''
plt.figure()
radviz(iris,u"種類")

# Box plots grouped by the same class column.
'''箱ひげ'''
iris.boxplot(by=u"種類")

plt.show()

# Write the correlation matrix to disk.
iris_co.to_csv('iris_corr.csv')
예제 #38
0
def plot_radial(data_frame, class_name):
    """Write a RadViz plot of ``data_frame`` to the temporary directory.

    The current figure is cleared, the plot is drawn with ``class_name`` as
    the class column and titled 'Radial Plot', and the image is saved as
    ``Status.RADIAL_NAME`` under ``Status.TEMP_DIR``.
    """
    plt.clf()
    radviz(data_frame, class_name)
    plt.title('Radial Plot')
    output_file = join(Status.TEMP_DIR, Status.RADIAL_NAME)
    plt.savefig(output_file)
예제 #39
0
파일: plot.py 프로젝트: YunfengHu/vixielab
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 12 09:48:33 2015

@author: dusty
"""

##---(Thu Feb 12 09:43:40 2015)--

from pandas.tools.plotting import radviz
import matplotlib.pyplot as plt
import pandas as pd
# Open a figure, load the e-tongue measurements and plot them as RadViz.
plt.figure()
data = pd.read_csv(
    '/Users/Dusty/Documents/Machine Learning/Vixie/etongueData.csv')
#data = data.drop('Sample', 1)
# Keep the class column 'Sample' plus the seven columns that are plotted.
data = data[['Sample', 'SRS', 'GPS', 'STS', 'UMS', 'SPS', 'SWS', 'BRS']]
radviz(data, 'Sample')
plt.show()
예제 #40
0
sns.heatmap(
    train[columns].corr(), annot=True, cmap='cubehelix_r'
)  # draws an annotated heatmap of the correlation matrix train[columns].corr()
plt.show()

# <a id="6210"></a> <br>
# ### 6-2-10 radviz

# In[ ]:

from pandas.tools.plotting import radviz
# Columns that are plotted; "OverallQual" doubles as the class column.
columns = [
    'SalePrice', 'OverallQual', 'TotalBsmtSF', 'GrLivArea', 'GarageArea',
    'FullBath', 'YearBuilt', 'YearRemodAdd'
]
radviz(train[columns], "OverallQual")

# <a id="6212"></a> <br>
# ### 6-2-12 Factorplot

# In[ ]:

# SalePrice against OverallQual, with one hue per 'Functional' value.
sns.factorplot('OverallQual', 'SalePrice', hue='Functional', data=train)
plt.show()

# <a id="63"></a> <br>
# ## 6-3 Data Preprocessing
# **Data preprocessing** refers to the transformations applied to our data before feeding it to the algorithm.
#
# Data preprocessing is a technique used to convert raw data into a clean data set. In other words, whenever data is gathered from different sources, it is collected in a raw format that is not suitable for analysis.
# There are plenty of steps in data preprocessing; we list just some of them:
예제 #41
0
# between each pair of features
#
# From the pairplot, we'll see that the Iris-setosa species is separated from the other
# two across all feature combinations
sb.pairplot(dataset.drop("Id", axis=1), hue="Species", size=3, diag_kind="kde")

# Now that we've covered seaborn, let's go back to some of the ones we can make with Pandas
# We can quickly make a boxplot with Pandas on each feature split out by species
dataset.drop("Id", axis=1).boxplot(by="Species", figsize=(12, 6))

# A final multivariate visualization technique pandas has is radviz
# Which puts each feature as a point on a 2D plane, and then simulates
# having each sample attached to those points through a spring weighted
# by the relative value for that feature
# radviz lives in pandas.plotting since pandas 0.20; fall back to the
# legacy pandas.tools.plotting location on older installs.
try:
    from pandas.plotting import radviz
except ImportError:
    from pandas.tools.plotting import radviz
radviz(dataset.drop("Id", axis=1), "Species")

dataset.info()  # checking if there is any inconsistency in the dataset
# as we see there are no null values in the dataset, so the data can be processed

# convert Species column from categorical to numerical values

from sklearn.preprocessing import LabelEncoder
numbers = LabelEncoder()
dataset['Species_encoded'] = numbers.fit_transform(dataset.Species)
# NOTE(review): bare expression with no effect in a script; `y_train` is not
# defined in the visible part of this snippet - confirm it exists or remove.
y_train

# split data

# X: every column except the last two; y: the column at position 6.
X = dataset.iloc[:, :-2]
y = dataset.iloc[:, 6]
# The plotting helpers live in pandas.plotting since pandas 0.20; the old
# pandas.tools.plotting module was later removed, so try the new location
# first and fall back for legacy installs.
try:
    from pandas.plotting import andrews_curves, parallel_coordinates, radviz
except ImportError:
    from pandas.tools.plotting import andrews_curves, parallel_coordinates, radviz

# parallel coordinates plot
plt.figure(figsize=(18,6))
parallel_coordinates(sampleData, '_user_id')


# andrews curves
plt.figure(figsize=(18,6))
andrews_curves(sampleData,'_user_id')

# radviz spring constant plot
plt.figure(figsize=(12,10))
radviz(sampleData,"_user_id")

# As can be observed from the above plots, the data is very closely spaced without any apparent linear or non-linear boundaries
# Initial inference - a tree-based approach might work better than a kernel-based boundary-fitting approach

# plotting predictor variables, not considering feat8 and feat16 here as the majority of their values would be replaced missing values
features= ["feat11","feat13","feat5","feat2","feat1","feat4","feat7","feat14","feat10","feat15","feat21","feat3",
           "feat18","feat9","feat17","feat20","feat6","feat12","feat19","feat22"]
workingData[features].hist()


# There are a few features with outliers; however, I am not handling outliers as this is health data and anomalies might
# help classification accuracy


#============================================================================================================
예제 #43
0
파일: cars.py 프로젝트: jkmsmith/python
# Drop rows that contain missing values.
data = data.dropna()

# Mask every entry equal to 0 (masked entries become NaN).
# NOTE(review): the rows containing those masked zeros are not dropped
# afterwards, and the astype(int) below would fail on a NaN horsepower -
# confirm whether the mask was meant to run before dropna().
data = data[data != 0]

#Get rid of incomplete values

data.origin.value_counts()
#245 American cars, 79 Asian cars, 70 European cars

data['model year'] = data['model year'] - 70 
#set years equal to number of years after 1970

data['horsepower'] = data['horsepower'].astype(int)

# radviz lives in pandas.plotting since pandas 0.20; fall back to the
# legacy pandas.tools.plotting location on older installs.
try:
    from pandas.plotting import radviz
except ImportError:
    from pandas.tools.plotting import radviz
radviz(data.drop(["name","model year"], axis=1), "origin")
plt.show()
#American cars have much larger displacement than asian and european cars
#Asian and european cars tend to have higher acceleration and mpg compared to most
#american cars.

sns.violinplot(x = 'origin', y = 'mpg', data = data)
plt.show()

#American cars tend to have mpg about 15 mpg, european cars have about 25 mpg, and
#asian cars have about 32 mpg

sns.violinplot(x = 'origin', y = 'displacement', data = data)
plt.show()

#American engines are MUCH bigger than Asian and European engines, which are about the 
예제 #44
0
                                new_word = word["stem"] + suffix
                                if suffix == "NULL":
                                        new_word = word["stem"]
                                occur = word["instances"][new_word][0] * 1.0
                                if occur > maximum:
                                        maximum = occur
                                if occur < minimum:
                                        minimum = occur
                                values[suffix].append(occur)
                        for suffix in suffixes:
                                values[suffix].append(maximum)
                        for index in values:
                                if(min(values[index]) == max(values[index]) and len(values[index]) != 1):
                                        replace(values[index])
                        data = pd.DataFrame(values)
                        ax = radviz(data, "Name", color = colors)
                        ax.legend_.remove()
                        if test:
                                print data.iloc[[1]]
                                plt.show()
                print "done"
                plt.show() 

def normalize(array):
        """Scale every value of ``array`` by its largest value.

        Returns a new list in which the largest element maps to 1.0.
        The division is always a float division, so integer input is not
        truncated under Python 2, and the result is a real list (not a
        lazy ``map`` object) under Python 3.

        An empty input yields an empty list.  Raises ZeroDivisionError
        when the largest value is 0.
        """
        values = list(array)
        if not values:
                return []
        maximum = float(max(values))
        return [x / maximum for x in values]

def generate_points_on_circle(num):
        arr = []
        for i in range(num):
                rad = i * 2 * math.pi / num
예제 #45
0
파일: code.py 프로젝트: sthambidurai/MLWine

# Determine the shape of the data
print("{} instances with {} features\n".format(*df_red.shape))

# Determine the frequency of each class
print(df_red.groupby('quality').count())


# %matplotlib inline
# (IPython/Jupyter magic: it is a syntax error in a plain .py script, so it
# is kept here only as a comment.)

# The plotting helpers live in pandas.plotting since pandas 0.20; the old
# pandas.tools.plotting module was later removed, so try the new location
# first and fall back for legacy installs.
try:
    from pandas.plotting import parallel_coordinates, radviz, scatter_matrix
except ImportError:
    from pandas.tools.plotting import parallel_coordinates, radviz, scatter_matrix

# Create a scatter matrix of the dataframe features
scatter_matrix(df_red, alpha=0.2, figsize=(12, 12), diagonal='kde')
plt.show()


# Parallel coordinates, one line per row, coloured by 'quality'
plt.figure(figsize=(12,12))
parallel_coordinates(df_red, 'quality')
plt.show()


# RadViz of all features, coloured by 'quality'
plt.figure(figsize=(12,12))
radviz(df_red, 'quality')
plt.show()

#next start the scikit learn portion. regression or
#good for this data 
 def test_radviz_deprecated(self, iris):
     """Check that ``plotting.radviz`` emits a FutureWarning when called
     with ``frame=iris`` and ``class_column='Name'``."""
     radviz_kwargs = {'frame': iris, 'class_column': 'Name'}
     with tm.assert_produces_warning(FutureWarning):
         plotting.radviz(**radviz_kwargs)
# Seaborn package scatter plot that labels each species' data with a different colour
# used for identifying bivariate relationships
sns.FacetGrid(iris, hue="Species", size=5).map(plt.scatter, "SepalLengthCm", "SepalWidthCm").add_legend()

# Seaborn package Box plot
# useful for exploring univariate relationships
sns.boxplot(x="Species", y="PetalLengthCm", data=iris)

# Seaborn Kdeplot - plots density of feature measurements
# useful for exploring univariate relationships
sns.FacetGrid(iris, hue="Species", size=6).map(sns.kdeplot, "PetalLengthCm").add_legend()

# Pandas Radviz - puts each feature on 2D plane, based on spring tension minimization algorithm
# useful for multivariate visualization
# radviz lives in pandas.plotting since pandas 0.20; fall back to the
# legacy pandas.tools.plotting location on older installs.
try:
    from pandas.plotting import radviz
except ImportError:
    from pandas.tools.plotting import radviz
radviz(iris.drop("Id", axis=1), "Species")

# Seaborn pairplot - plots all pairs of features
# useful for exploring bivariate relationships
sns.pairplot(iris.drop("Id", axis=1), hue="Species")










예제 #48
0
def _plot_distribution(column, filename):
    """Save a two-panel distribution figure for one column of ``df``.

    Top panel: box plot of ``column`` grouped by "classe".
    Bottom panel: normalised histogram of ``column`` with its KDE overlaid.
    The figure is written to ``filename`` and then closed.
    """
    fig, axes = plt.subplots(2, 1, figsize=(10, 16))
    values = pd.Series(df[column])
    df.boxplot(column=column, by="classe", ax=axes[0])
    # NOTE(review): `normed` was replaced by `density` in newer matplotlib
    # releases - confirm which spelling the installed version accepts.
    values.hist(color='g', ax=axes[1], normed=True)
    # Lower-case "kde": current pandas rejects the upper-case spelling.
    values.plot(kind="kde", ax=axes[1], style='r-', label=" KDE", legend=True)
    fig.savefig(filename)
    plt.close(fig)


def plot_main():
    """Generate and save every exploratory figure for the module-level ``df``.

    For each statistic column a box plot / histogram+KDE figure is saved
    as ``kde_boxplot_<name>.png``; then the Andrews-curves and
    parallel-coordinates figure, the scatter matrix and the RadViz plot are
    saved.  All plots are grouped or coloured by the "classe" column.

    The previous implementation indexed the 2x1 axes array as if it were
    2-D (``axes[0][1]``) and passed a whole row of axes as ``ax=``, which
    raises before anything is saved; every figure now uses ``axes[0]`` and
    ``axes[1]`` consistently.  The "moyenne" figure is saved and closed
    like the others (its save call was commented out before).
    """
    # (column in df, output file) - file names are kept exactly as before.
    distribution_plots = [
        ("moyenne", 'kde_boxplot_moyenne.png'),
        ("ecart-type", 'kde_boxplot_ecart-type.png'),
        ("mediane", 'kde_boxplot_mediane.png'),
        ("entropie", 'kde_boxplot_entropie.png'),
        ("uniformit", 'kde_boxplot_uniformit.png'),
        ("surface", 'kde_boxplot_surface.png'),
        ("eccentricity", 'kde_boxplot_eccentricitie.png'),
    ]
    for column, filename in distribution_plots:
        _plot_distribution(column, filename)

    # Andrews curves (top) and parallel coordinates (bottom).
    fig, axes = plt.subplots(2, 1, figsize=(10, 17))
    andrews_curves(df, 'classe', ax=axes[0])
    parallel_coordinates(df, 'classe', ax=axes[1])
    fig.savefig('Andrew_curves_df2.png')
    plt.close(fig)

    # Scatter matrix with KDE on the diagonal.
    fig, axes = plt.subplots(1, 1, figsize=(10, 10))
    scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde', ax=axes)
    fig.savefig('matrix_corr_kde.png')
    plt.close(fig)

    # RadViz view of the multivariate data.
    fig, axes = plt.subplots(1, 1, figsize=(10, 10))
    radviz(df, "classe", ax=axes)
    fig.savefig('RadViz_df.png')
    plt.close(fig)
예제 #49
0
             size=2,
             diag_kind="kde")

# In[ ]:

# Box plots of the three selected variables, grouped by TARGET.
train[['var15', 'var36', 'logvar38', 'TARGET']].boxplot(by="TARGET",
                                                        figsize=(12, 6))

# In[ ]:

# A final multivariate visualization technique pandas has is radviz
# Which puts each feature as a point on a 2D plane, and then simulates
# having each sample attached to those points through a spring weighted
# by the relative value for that feature
# radviz lives in pandas.plotting since pandas 0.20; fall back to the
# legacy pandas.tools.plotting location on older installs.
try:
    from pandas.plotting import radviz
except ImportError:
    from pandas.tools.plotting import radviz
radviz(train[['var15', 'var36', 'logvar38', 'TARGET']], "TARGET")

# # now look at all 8 features together

# In[ ]:

# Bare expression: shows the value in a notebook, no effect in a script.
features

# In[ ]:

radviz(train[features + ['TARGET']], "TARGET")

# In[ ]:

sns.pairplot(train[features + ['TARGET']],
             hue="TARGET",
예제 #50
0
plt.show()
# The diagonal elements in a pairplot show the histogram by default
# We can update these elements to show other things, such as a kde
sns.pairplot(iris.drop("Id", axis=1), hue="Species", size=3, diag_kind="kde")
plt.show()

# The pandas plotting helpers used below live in pandas.plotting since
# pandas 0.20; the old pandas.tools.plotting module was later removed, so
# try the new location first and fall back for legacy installs.
try:
    from pandas.plotting import andrews_curves, parallel_coordinates, radviz
except ImportError:
    from pandas.tools.plotting import andrews_curves, parallel_coordinates, radviz

# Now that we've covered seaborn, let's go back to some of the ones we can make with Pandas
# We can quickly make a boxplot with Pandas on each feature split out by species
iris.drop("Id", axis=1).boxplot(by="Species", figsize=(12, 6))
plt.show()
# One cool more sophisticated technique pandas has available is called Andrews Curves
# Andrews Curves involve using attributes of samples as coefficients for Fourier series
# and then plotting these
andrews_curves(iris.drop("Id", axis=1), "Species")
plt.show()
# Another multivariate visualization technique pandas has is parallel_coordinates
# Parallel coordinates plots each feature on a separate column & then draws lines
# connecting the features for each data sample
parallel_coordinates(iris.drop("Id", axis=1), "Species")
plt.show()
# A final multivariate visualization technique pandas has is radviz
# Which puts each feature as a point on a 2D plane, and then simulates
# having each sample attached to those points through a spring weighted
# by the relative value for that feature
radviz(iris.drop("Id", axis=1), "Species")
plt.show()

 def test_radviz_deprecated(self, iris):
     """Check that ``plotting.radviz`` emits a FutureWarning when called
     with ``frame=iris`` and ``class_column='Name'``."""
     radviz_kwargs = {'frame': iris, 'class_column': 'Name'}
     with tm.assert_produces_warning(FutureWarning):
         plotting.radviz(**radviz_kwargs)
예제 #52
0
#####################################################################
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
# The plotting helpers live in pandas.plotting since pandas 0.20; the old
# pandas.tools.plotting module was later removed, so try the new location
# first and fall back for legacy installs.
try:
    from pandas.plotting import parallel_coordinates, scatter_matrix, radviz
except ImportError:
    from pandas.tools.plotting import parallel_coordinates, scatter_matrix, radviz

# Build a DataFrame from the scikit-learn iris data, one column per feature.
iris   = load_iris()
df = pd.DataFrame(iris.data)
df.columns = iris.feature_names

# Boxplot
df.plot(kind='box')
plt.show()

# Histogram
df.plot(kind='hist')
plt.show()

# Radviz
# The class column 'name' holds iris.target; it is added only now, so the
# box plot and histogram above cover the feature columns alone.
df['name'] = iris.target
fig = radviz(df, 'name')
plt.show()

# Scatterplot Matrix
fig = scatter_matrix(df, alpha=0.2, diagonal='kde')
plt.show()

# Parallel Coordinates
fig = parallel_coordinates(df, 'name')
plt.show()
# Fit `lda` on (features, labels) and transform the same features with it.
new_features = lda.fit(features, labels).transform(features)
print(new_features)


# Machine learning beginning
# The plotting helpers live in pandas.plotting since pandas 0.20; the old
# pandas.tools.plotting module was later removed, so try the new location
# first and fall back for legacy installs.
try:
    from pandas.plotting import parallel_coordinates, radviz, scatter_matrix
except ImportError:
    from pandas.tools.plotting import parallel_coordinates, radviz, scatter_matrix

# Create a scatter matrix of the dataframe features
scatter_matrix(listing.dataframe, alpha=0.2, figsize=(12, 12), diagonal='kde')
plt.show()

# Parallel coordinates, coloured by 'accommodates'
plt.figure(figsize=(12,12))
parallel_coordinates(listing.dataframe, 'accommodates')
plt.show()

# RadViz, using 'id' as the class column
plt.figure(figsize=(12,12))
radviz(listing.dataframe, 'id')
plt.show()

# Data extraction
from sklearn.datasets.base import Bunch

DATA_DIR = os.path.abspath(os.path.join(".", "..", "team-zero", "data", "listing"))

# Show the contents of the data directory, skipping dot-files
for name in os.listdir(DATA_DIR):
    if name.startswith("."): continue
    # print() with a single argument behaves the same on Python 2 and 3.
    print("- {}".format(name))

예제 #54
0
파일: vizmkr.py 프로젝트: rebeccabilbro/viz
def rad_viz(df,labels):
    """Draw a RadViz plot of ``df`` and save it to 'figures/rad_viz.png'.

    ``labels`` is passed to ``radviz`` as its second positional argument
    (presumably the name of the class column in ``df`` - confirm with the
    callers).
    """
    # Colours come from seaborn's current colour palette.
    # NOTE(review): pandas documents radviz as returning an Axes, so the name
    # `fig` looks like a misnomer - confirm before relying on it.
    fig = radviz(df, labels, color=sns.color_palette())
    # Save the current figure with a transparent background.
    plt.savefig('figures/rad_viz.png', transparent=True)