def test_andrews_curves(self):
        """Check line colors, legend colors and the ``data=`` warning of ``andrews_curves``.

        Plots ``self.iris`` with hex colors, named colors and a colormap,
        comparing the first ten line colors against the expected ones, then
        checks the legend-handle colors on a three-row frame.
        """
        from pandas.tools.plotting import andrews_curves
        from matplotlib import cm

        df = self.iris

        # Smoke test with default colors.
        _check_plot_works(andrews_curves, frame=df, class_column='Name')

        # Explicit hex colors.
        rgba = ('#556270', '#4ECDC4', '#C7F464')
        ax = _check_plot_works(andrews_curves, frame=df, class_column='Name', color=rgba)
        self._check_colors(ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10])

        # Named colors.
        cnames = ['dodgerblue', 'aquamarine', 'seagreen']
        ax = _check_plot_works(andrews_curves, frame=df, class_column='Name', color=cnames)
        self._check_colors(ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10])

        # Colormap: expected colors are cm.jet sampled evenly, one per distinct 'Name'.
        ax = _check_plot_works(andrews_curves, frame=df, class_column='Name', colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
        self._check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10])

        # Three rows, each its own class: the legend handles are checked
        # against the same three colors.
        colors = ['b', 'g', 'r']
        df = DataFrame({"A": [1, 2, 3],
                        "B": [1, 2, 3],
                        "C": [1, 2, 3],
                        "Name": colors})
        ax = andrews_curves(df, 'Name', color=colors)
        handles, labels = ax.get_legend_handles_labels()
        self._check_colors(handles, linecolors=colors)

        # Passing the frame through the ``data=`` keyword must emit a FutureWarning.
        with tm.assert_produces_warning(FutureWarning):
            andrews_curves(data=df, class_column='Name')
示例#2
0
def irisVisualization():
    """Plot the iris data set with several pandas and seaborn chart types.

    Builds a DataFrame from ``load_iris()``, adds a categorical ``Species``
    column, draws Andrews curves and parallel coordinates with pandas, then a
    joint plot, scatter facet grid, box plot, violin plot and pair plot with
    seaborn, and finally shows every open figure.
    """
    sns.set(style="white", color_codes=True)
    irisdata = load_iris()
    iris = pd.DataFrame(irisdata.data, columns=irisdata.feature_names)
    iris['Species'] = pd.Categorical.from_codes(irisdata.target,
                                                irisdata.target_names)
    # Feature columns: sepal length (cm), sepal width (cm),
    # petal length (cm), petal width (cm)

    # pandas plots -- both draw on the current axes, so each one gets its own
    # figure instead of being painted over the previous chart.
    from pandas.tools.plotting import andrews_curves, parallel_coordinates
    plt.figure()
    andrews_curves(iris, "Species")
    plt.figure()
    parallel_coordinates(iris, "Species")

    # seaborn plots; jointplot, FacetGrid and pairplot create their own figures
    sns.jointplot(x="sepal length (cm)",
                  y="sepal width (cm)",
                  data=iris,
                  size=5)
    sns.FacetGrid(iris, hue="Species", size=5) \
        .map(plt.scatter, "sepal length (cm)", "sepal width (cm)").add_legend() # sns.kdeplot
    # boxplot and violinplot are axes-level functions: without a fresh figure
    # they would be drawn into the facet grid above.
    plt.figure()
    sns.boxplot(x="Species", y="sepal length (cm)", data=iris)
    plt.figure()
    sns.violinplot(x="Species", y="sepal length (cm)", data=iris, size=6)
    sns.pairplot(iris, hue="Species", size=3)
    # Use pyplot directly: the ``sns.plt`` alias is not part of seaborn's API.
    plt.show()
示例#3
0
def multidimensional_plots(df,
                           target_name,
                           maxevents=10000,
                           standardize=False):
    """Draw Andrews curves, parallel coordinates and RadViz plots in one figure.

    Parameters
    ----------
    df : frame holding the feature columns and the class column.
    target_name : name of the class column handed to each plotting function.
    maxevents : maximum number of randomly ordered rows that are plotted.
    standardize : accepted but not used by this function.

    Returns
    -------
    The matplotlib figure that contains the three subplots.
    """

    # randomize the data frame order and keep at most ``maxevents`` rows
    df_random = df.reindex(np.random.permutation(df.index))[:maxevents]

    # Make a figure and declare the size
    fig = plt.figure(figsize=(9, 9))

    # Top-left panel of the 2x2 grid: Andrews curves
    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    # Top-right panel: parallel coordinates
    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random,
                         target_name,
                         ax=current_axis,
                         colormap='gist_rainbow')

    # Bottom-left panel: RadViz (the fourth grid cell stays empty)
    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    #fig.tight_layout()
    return fig
示例#4
0
    def test_andrews_curves(self):
        """Verify the colors produced by ``andrews_curves`` and its ``data=`` warning.

        The iris frame is plotted with two explicit palettes and with the
        ``jet`` colormap; in each case the first ten lines must carry the
        color mapped to their 'Name'. A three-row frame then checks the
        legend-handle colors, and a call using ``data=`` must emit a
        ``FutureWarning``.
        """
        from pandas.tools.plotting import andrews_curves
        from matplotlib import cm

        iris = self.iris
        names = iris['Name']

        # Smoke test with default colors.
        _check_plot_works(andrews_curves, iris, 'Name')

        # Explicit palettes: hex strings first, then named colors.
        hex_colors = ('#556270', '#4ECDC4', '#C7F464')
        named_colors = ['dodgerblue', 'aquamarine', 'seagreen']
        for palette in (hex_colors, named_colors):
            axes = _check_plot_works(andrews_curves, iris, 'Name', color=palette)
            self._check_colors(axes.get_lines()[:10],
                               linecolors=palette,
                               mapping=names[:10])

        # Colormap: one evenly spaced sample of ``jet`` per distinct name.
        axes = _check_plot_works(andrews_curves, iris, 'Name', colormap=cm.jet)
        expected = lmap(cm.jet, np.linspace(0, 1, names.nunique()))
        self._check_colors(axes.get_lines()[:10],
                           linecolors=expected,
                           mapping=names[:10])

        # Legend handles on a frame with one row per class.
        legend_colors = ['b', 'g', 'r']
        small = DataFrame({"A": [1, 2, 3],
                           "B": [1, 2, 3],
                           "C": [1, 2, 3],
                           "Name": legend_colors})
        axes = andrews_curves(small, 'Name', color=legend_colors)
        legend_handles, _ = axes.get_legend_handles_labels()
        self._check_colors(legend_handles, linecolors=legend_colors)

        # The ``data=`` keyword must trigger a FutureWarning.
        with tm.assert_produces_warning(FutureWarning):
            andrews_curves(data=small, class_column='Name')
示例#5
0
def multidimensional_plots(df, target_name, maxevents=10000):
    """Plot standardized data as Andrews curves, parallel coordinates and RadViz.

    Every column is shifted by its mean and divided by its standard
    deviation, the class column ``target_name`` is then restored from ``df``,
    the rows are shuffled and at most ``maxevents`` of them are drawn into
    three panels of a 2x2 figure. Returns that figure.
    """
    # Z-score the frame, then put the untouched class column back.
    standardized = (df - df.mean()) / df.std()
    standardized[target_name] = df[target_name]

    # Shuffle so that the truncation below keeps a random subset.
    shuffled_index = np.random.permutation(standardized.index)
    sample = standardized.reindex(shuffled_index)
    if sample.shape[0] > maxevents:
        sample = sample[:maxevents]

    figure = plt.figure(figsize=(9, 9))

    andrews_axis = figure.add_subplot(2, 2, 1)
    andrews_axis.set_title('Andrews Curves')
    andrews_curves(sample, target_name, ax=andrews_axis)

    parallel_axis = figure.add_subplot(2, 2, 2)
    parallel_axis.set_title('Parallel Coordinates')
    parallel_coordinates(sample, target_name, ax=parallel_axis,
                         colormap='gist_rainbow')

    radviz_axis = figure.add_subplot(2, 2, 3)
    radviz_axis.set_title('Radviz Spring Tension')
    radviz(sample, target_name, ax=radviz_axis, colormap='jet')

    return figure
示例#6
0
def multidimensional_plots(df, target_name, maxevents=10000):
    """Plot standardized, shuffled data in three multivariate charts.

    Returns the figure holding Andrews curves, parallel coordinates and
    RadViz panels, each grouped by the ``target_name`` column. At most
    ``maxevents`` rows are drawn.
    """
    # normalize: subtract the column mean and divide by the column std
    df_std = (df - df.mean())/df.std()
    # put the unnormalized target back
    df_std[target_name] = df[target_name]
    # randomize the data frame order
    df_random = df_std.reindex(np.random.permutation(df_std.index))

    # make sure this doesn't take too long
    if df_random.shape[0] > maxevents:
        df_random = df_random[:maxevents]
    
    # Make a figure and declare the size
    fig = plt.figure(figsize=(9, 9))

    # Panel 1 of the 2x2 grid: Andrews curves
    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    # Panel 2: parallel coordinates
    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random, target_name, ax=current_axis, colormap='gist_rainbow')

    # Panel 3: RadViz (the fourth grid cell is left empty)
    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    #fig.tight_layout()
    return fig
示例#7
0
def andrew_curves():
    """Plot Andrews curves of ``data`` grouped by the 'Class' column and show them.

    NOTE(review): ``plt_feat`` and ``data_norm`` are built but never used; the
    plot is drawn from ``data``, which is not defined inside this function.
    Presumably ``data_norm`` was meant to be plotted -- confirm.
    """
    # Full feature list (unused below).
    plt_feat =  ['Alcohol', 'MalicAcid', 'Ash', 'AlcalinityOfAsh', 'Magnesium', 'TotalPhenols', 
                 'Flavanoids', 'NonflavanoidPhenols', 'Proanthocyanins', 'ColorIntensity', 
                 'Hue', 'OD280/OD315', 'Proline']
    # Feature subset joined column-wise with ``y`` into ``data_norm``.
    plt_feat1 =  ['MalicAcid', 'Ash', 'OD280/OD315', 'Magnesium','TotalPhenols']
    data_norm = pd.concat([X_norm[plt_feat1], y], axis=1)
    andrews_curves(data, 'Class')
    plt.show()
示例#8
0
def draw_curve_plot2(dframe, cid, behav):
    """Show Andrews curves of ``dframe`` grouped by the 'avgIC' column.

    The title is built from the child id ``cid`` and the behaviour string
    ``behav``. Before showing, the figure window is resized to the size
    reported by its ``window.maxsize()``.
    """
    _, axis = plt.subplots(nrows=1, ncols=1)
    andrews_curves(dframe, 'avgIC', ax=axis)

    plt.legend(loc='best')
    plt.title(('Child: ' + str(cid)) + (', Behaviour:' + behav))
    plt.xlabel('session number')
    plt.ylabel('average correct answers')

    manager = plt.get_current_fig_manager()
    manager.resize(*manager.window.maxsize())
    plt.show()
示例#9
0
def module3():
    """
    Notes on module 3: a tour of basic plot types.

    Reads the wheat data set from a hard-coded local path and shows a
    histogram, a 2D scatter plot and a 3D scatter plot, then draws parallel
    coordinates and Andrews curves for the iris data and finally an image of
    the correlation matrix of random data.
    """

    # The Seven Basic Tools of Quality: https://en.wikipedia.org/wiki/Seven_Basic_Tools_of_Quality

    #Histogram
    # NOTE(review): absolute, machine-specific path -- only works on the author's machine.
    path = "C:/Users/jbennett02/Documents/Magic Briefcase/classwork/edx/Microsoft/DAT210x.b/module3/Datasets/"
    df = pd.read_csv(path + "wheat.data")
    matplotlib.style.use('ggplot')  # Look Pretty
    df.asymmetry.plot.hist(title='Asymmetry', bins=10)
    plt.show()

    #2D scatterplot
    df.plot.scatter(x='area', y='perimeter')
    plt.show()

    #3D scatterplot
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.set_xlabel('area')
    ax.set_ylabel('perimeter')
    ax.set_zlabel('asymmetry')
    ax.scatter(df.area, df.perimeter, df.asymmetry, c='r', marker='.')
    plt.show()

    #Parallel Coordinates -- higher dimensionality visualizations
    data = load_iris()
    df = pd.DataFrame(data.data, columns=data.feature_names)
    # map each numeric target to its class name
    df['target_names'] = [data.target_names[i] for i in data.target]
    # Parallel Coordinates Start Here:
    plt.figure()
    parallel_coordinates(df, 'target_names')
    plt.show()

    #Andrews curve
    plt.figure()
    andrews_curves(df, 'target_names')
    plt.show()

    #correlation plot of five columns of random normal data
    df = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e'])
    print(df.corr())
    plt.imshow(df.corr(), cmap=plt.cm.Blues, interpolation='nearest')
    plt.colorbar()
    # one tick per column, labelled with the column name
    tick_marks = [i for i in range(len(df.columns))]
    plt.xticks(tick_marks, df.columns, rotation='vertical')
    plt.yticks(tick_marks, df.columns)
    # NOTE(review): no plt.show() follows in the lines visible here.
示例#10
0
def draw_curve_plot(dframe, cid, behav):
    """Show Andrews curves and per-session answering times for one child.

    Parameters
    ----------
    dframe : frame with 'is_correct', 'session', 'date' and 'answering_time'
        columns.
    cid : child identifier, converted with ``str`` for the title.
    behav : behaviour label (a string) appended to the title.

    The upper panel holds Andrews curves grouped by 'is_correct'; the lower
    panel holds one answering-time line per session. The window is resized to
    the size reported by ``window.maxsize()`` before the figure is shown.
    """
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)
    # Andrews' curves of the frame that was passed in, not of a module-level ``df``
    andrews_curves(dframe, 'is_correct', ax=ax1)
    # multiline plot with group by: one line per session
    for key, grp in dframe.groupby(['session']):
        ax2.plot(grp['date'], grp['answering_time'], label="")
    plt.legend(loc='best')
    graphTitle = 'Child: ' + str(cid)
    graphTitle += ', Behaviour:' + behav
    plt.title(graphTitle)
    plt.xlabel('date')
    plt.ylabel('answering time')
    mng = plt.get_current_fig_manager()
    mng.resize(*mng.window.maxsize())
    plt.show()
示例#11
0
    def Clonal_Evolution_Multidimensional_Data(self):
        """Save parallel-coordinates, Andrews-curves and RadViz plots of all snapshots.

        Every frame in ``DataStructs`` gets a 't' column holding its position
        in the sequence (0.0, 1.0, ...); the frames are stacked, the 't',
        'Size', 'PR' and 'MR' columns are divided by their maxima, and three
        EPS files grouped by 'ID' are written to the working directory.

        NOTE(review): the 't' column is assigned on the frames stored in
        ``DataStructs`` themselves, so calling this modifies them.
        """
        i = 0.0
        Clonal_Evolution_df = pd.DataFrame()
        for df in DataStructs:
            if (i == 0):
                # first frame: becomes the accumulator (same object, not a copy)
                t = [i] * len(df)
                Clonal_Evolution_df = df
                Clonal_Evolution_df['t'] = pd.Series(
                    t, index=Clonal_Evolution_df.index)
            else:
                # later frames: tag with the time step, then append
                t = [i] * len(df)
                df['t'] = pd.Series(t, index=df.index)
                Clonal_Evolution_df = pd.concat([Clonal_Evolution_df, df],
                                                ignore_index=True)

            i = i + 1.0

        C = Clonal_Evolution_df['ID']
        S = Clonal_Evolution_df['Size']
        M = Clonal_Evolution_df['MR']
        P = Clonal_Evolution_df['PR']
        T = Clonal_Evolution_df['t']

        # Scale each numeric column by its maximum; 'ID' is kept as the class label.
        Normalised_df = pd.DataFrame(zip(T / max(T), S / max(S), P / max(P),
                                         M / max(M), C),
                                     columns=['t', 'Size', 'PR', 'MR', 'ID'])

        # Parallel coordinates
        plt.figure()
        parallel_coordinates(Normalised_df, 'ID',
                             colormap='jet').set_title("PC Plot")
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
        plt.savefig('Clonal_Evolution_Parallel_Coords_Plot.eps',
                    format='eps',
                    dpi=1000)

        # Andrews curves
        plt.figure()
        andrews_curves(Normalised_df, 'ID', colormap='jet')
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
        plt.savefig('Clonal_Evolution_Andrews_Curves_Plot.eps',
                    format='eps',
                    dpi=1000)

        # RadViz
        plt.figure()
        radviz(Normalised_df, 'ID', colormap='jet')
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
        plt.savefig('Clonal_Evolution_RadViz_Plot.eps', format='eps', dpi=1000)
示例#12
0
def t4(tp='r'):
    """Read the stock CSV and show one multivariate plot grouped by 'Name'.

    ``tp`` selects the chart: 'r' for RadViz, 'a' for Andrews curves and 'p'
    for parallel coordinates. Any other value shows an empty figure.
    """
    # Multi-dimensional data visualization (conda install pandas); see
    # http://cloga.info/%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90/2016/10/12/multivariate-data-visualization
    import pandas as pd
    import matplotlib.pyplot as plt
    data = pd.read_csv('file:///e:/stock/xx1')
    from pandas.tools.plotting import andrews_curves
    from pandas.tools.plotting import parallel_coordinates
    from pandas.tools.plotting import radviz

    # Selector code paired with its plotting function, tested in order.
    plotters = (
        ('r', radviz),
        ('a', andrews_curves),
        ('p', parallel_coordinates),
    )

    plt.figure()
    for code, plotter in plotters:
        if tp == code:
            plotter(data, 'Name')
            break
    plt.show()
 def plot_regression(self):
     """Show a tmax/tmin regression plot, Andrews curves, a box plot and a line plot.

     Reads ``self.filtered`` and ``self.df``; the first two figures are shown
     by the first ``plt.show()``, the last two by the second.
     """
     # sample plot of points
     x = self.filtered.tmax
     y = self.filtered.tmin
     # degree-1 fit of tmin against tmax
     fit = polyfit(x,y,1)
     fit_fn = poly1d(fit)
     # takes in x and returns an estimate for y
     # fig 1: points as yellow circles, fitted line dashed black
     plt.plot(x,y, 'yo', x, fit_fn(x), '--k')
     plt.title("Temperature pattern regression plot")
     plt.xlabel("tmax")
     plt.ylabel("tmin")
     # fig 2
     # NOTE(review): the frame is selected with [[2,3]] yet grouped by the
     # label 'tmin' -- confirm the selection contains a 'tmin' column.
     plt.figure()
     andrews_curves(self.filtered[[2,3]], 'tmin')
     plt.title("Andrews curve plot for tmin")
     plt.show()
     # fig 3
     plt.figure()
     self.df[[2,3]].boxplot()
     # fig 4
     self.filtered.plot()
     plt.show()
示例#14
0
def multidimensional_plots(df, target_name, maxevents=10000, standardize=False):
    """Return a figure with Andrews curves, parallel coordinates and RadViz panels.

    The rows of ``df`` are shuffled and the first ``maxevents`` of them are
    plotted, grouped by the ``target_name`` column. ``standardize`` is
    accepted but has no effect.
    """
    # Shuffle the rows, then cap the number of plotted events.
    shuffled_index = np.random.permutation(df.index)
    sample = df.reindex(shuffled_index)[:maxevents]

    fig = plt.figure(figsize=(9, 9))

    # (grid position, title, plotting function, extra keyword arguments)
    panels = (
        (1, 'Andrews Curves', andrews_curves, {}),
        (2, 'Parallel Coordinates', parallel_coordinates,
         {'colormap': 'gist_rainbow'}),
        (3, 'Radviz Spring Tension', radviz, {'colormap': 'jet'}),
    )
    for position, title, plotter, options in panels:
        axis = fig.add_subplot(2, 2, position)
        axis.set_title(title)
        plotter(sample, target_name, ax=axis, **options)

    return fig
示例#15
0
 def plot_regression(self):
     """Plot a tmax/tmin regression, Andrews curves, a box plot and a line plot.

     Uses ``self.filtered`` for the regression, the Andrews curves and the
     final line plot, and ``self.df`` for the box plot.
     """
     tmax = self.filtered.tmax
     tmin = self.filtered.tmin
     # Degree-1 fit wrapped in a callable that estimates tmin from tmax.
     estimate = poly1d(polyfit(tmax, tmin, 1))

     # Figure 1: observations and the fitted line.
     plt.plot(tmax, tmin, 'yo', tmax, estimate(tmax), '--k')
     plt.title("Temperature pattern regression plot")
     plt.xlabel("tmax")
     plt.ylabel("tmin")

     selection = [2, 3]

     # Figure 2: Andrews curves of the selected columns.
     plt.figure()
     andrews_curves(self.filtered[selection], 'tmin')
     plt.title("Andrews curve plot for tmin")
     plt.show()

     # Figure 3: box plot of the same selection taken from the full frame.
     plt.figure()
     self.df[selection].boxplot()

     # Figure 4: line plot of the filtered frame.
     self.filtered.plot()
     plt.show()
示例#16
0
def Evt_Multi_D_Andrew_Plot(self, event):
    """Draw Andrews curves of the checked variables on the selected tab.

    The columns named by the second item of each entry in
    ``self.selected_checkboxes`` plus "customer_number" are taken from
    ``self.data``, sliced to rows ``self.minimum:self.maximum`` and plotted
    grouped by "customer_number". All four axes spines are hidden.
    """
    current_panel = self.New_Tab.GetPage(self.New_Tab.GetSelection())
    self.selected_checkbox()
    current_panel.canvas.figure.clf()

    columns = [entry[1] for entry in self.selected_checkboxes]
    columns.append("customer_number")
    subset = self.data[columns][self.minimum:self.maximum]

    axes = andrews_curves(subset, "customer_number")
    for side in ("left", "right", "top", "bottom"):
        axes.spines[side].set_color("none")
    current_panel.canvas.draw()
def Evt_Multi_D_Andrew_Plot(self, event):
	"""Draw Andrews curves of the checked variables on the selected tab's canvas."""
	page = self.New_Tab.GetSelection()  
	panel = self.New_Tab.GetPage(page)
	# refresh self.selected_checkboxes, then clear the previous drawing
	self.selected_checkbox()
	panel.canvas.figure.clf()
	# column names are the second item of each selected entry
	data_list = list()
	for variable in self.selected_checkboxes:
		data_list.append(variable[1])

	# the class column must be part of the plotted frame
	data_list.append("customer_number")
	data = self.data[data_list][self.minimum: self.maximum]
	ax= andrews_curves(data, "customer_number")
	# hide all four axes spines
	for direction in ["left", "right", "top", "bottom"]:
		ax.spines[direction].set_color("none")
	panel.canvas.draw()
	return
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

from pandas.tools.plotting import andrews_curves

# Use the ggplot style for every figure.
matplotlib.style.use('ggplot')

# Load the wheat seeds data set.
seeds_dataset = pd.read_csv('Datasets/wheat.data')

# Two feature sets: one without 'id', 'area' and 'perimeter', and one that
# only drops 'id'.
seeds_dataset1 = seeds_dataset.drop(labels=['id', 'area', 'perimeter'],
                                    axis='columns')
seeds_dataset2 = seeds_dataset.drop(labels='id', axis='columns')

# Andrews curves without 'area' and 'perimeter'.
plt.figure()
andrews_curves(seeds_dataset1, 'wheat_type')
plt.show()

# Andrews curves with 'area' and 'perimeter' included.
plt.figure()
andrews_curves(seeds_dataset2, 'wheat_type')
plt.show()
示例#19
0
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# Look pretty...
matplotlib.style.use('ggplot')
# If the above line throws an error, use plt.style.use('ggplot') instead

# Load up SKLearn's Iris Dataset into a Pandas Dataframe
# NOTE(review): load_iris and andrews_curves are not imported in the lines
# shown here -- confirm they are brought into scope elsewhere.
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
# Map every numeric target to its class name; this is the grouping column.
df['target_names'] = [data.target_names[i] for i in data.target]

# Andrews Curves Start Here:
plt.figure()
andrews_curves(df, 'target_names')
plt.show()



#
#   imshow
#
import matplotlib.pyplot as plt
import numpy as np
import random

# Seed both generators for reproducibility. ``np.random.randn`` draws from
# NumPy's global generator, which ``random.seed`` does not touch, so seeding
# only the stdlib module would leave ``df`` different on every run.
random.seed(1)
np.random.seed(1)
df = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e'])
df.corr()
示例#20
0
	YL = np.asarray(YL)
	YT = np.asarray(YT)
	YL[YL!='0']='2'
	YT[YT!='0']='2'
	treeclf = RandomForestClassifier(n_estimators=40, criterion='entropy', max_features='auto', bootstrap=True, oob_score=True, n_jobs=2, class_weight="balanced", random_state=42)
	treeclf.fit(XL, YL)

	learnguesses = [treeclf.predict(chunk) for chunk in Xlearn2]
	testguesses = [treeclf.predict(chunk) for chunk in Xtest]
	devguesses = [treeclf.predict(chunk) for chunk in Xdev]

	# p=1 # make 100% of the data noise
	# learnguesses, testguesses, devguesses = simulate_data(p)

	CXlearn = get_crf_data(learnguesses, Xlearn2, Nlearn2, Plearn2)
	CXtest = get_crf_data(testguesses, Xtest, Ntest, Ptest)
	CXdev = get_crf_data(devguesses, Xdev, Ndev, Pdev)
	clf.fit(CXlearn, Ylearn2, CXdev, Ydev)
	CYhat = clf.predict(CXtest)
	print "======CRF PERFORMANCE======"
	print_performance(lsum(Ytest), lsum(CYhat))

	# joblib.dump(clf, "Model/model.pkl")

if visualize:
	scatter_matrix(df, alpha=0.2, figsize=(8, 8), diagonal='none');
	plt.figure()
	andrews_curves(df, 'Class')
	plt.show()

joblib.dump(treeclf, "Model/treemodel.pkl")
# Draw a 30% random sample of every user's rows and add it to sampleData.
# The per-user samples are collected first and concatenated once: appending
# inside the loop re-copies the growing frame on every iteration and relies on
# DataFrame.append, which newer pandas releases no longer provide.
user_samples = []
for userid in pd.unique(workingData.loc[:,"_user_id"]):
    user_rows = workingData[workingData._user_id == userid]
    sampleCount = int(.3 * user_rows["_user_id"].count())
    df = user_rows.sample(sampleCount)
    user_samples.append(df)
sampleData = pd.concat([sampleData] + user_samples)
sampleData.info()

# parallel coordinates plot, one line per sampled row, colored by user
from pandas.tools.plotting import parallel_coordinates
plt.figure(figsize=(18,6))
parallel_coordinates(sampleData, '_user_id')


# andrews curves of the same sample
from pandas.tools.plotting import andrews_curves
plt.figure(figsize=(18,6))
andrews_curves(sampleData,'_user_id')

# radviz spring constant plot
from pandas.tools.plotting import radviz
plt.figure(figsize=(12,10))
radviz(sampleData,"_user_id")

# As can be observed from the plots above, the data is very closely spaced without any apparent linear or non-linear boundaries
# Initial inference - a tree-based approach might work better than a kernel-based boundary-fitting approach

# plotting predictor variables; feat8 and feat16 are not considered here as the majority of their values would be replaced missing values
features= ["feat11","feat13","feat5","feat2","feat1","feat4","feat7","feat14","feat10","feat15","feat21","feat3",
           "feat18","feat9","feat17","feat20","feat6","feat12","feat19","feat22"]
# one histogram per listed feature
workingData[features].hist()

import matplotlib

# Look pretty...
matplotlib.style.use('ggplot')
# If the above line throws an error, use plt.style.use('ggplot') instead

# Load up SKLearn's Iris Dataset into a Pandas Dataframe
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)

# Map every numeric target to its class name; this is the grouping column.
df['target_names'] = [data.target_names[i] for i in data.target]

# Parallel Coordinates Start Here:
plt.figure()
parallel_coordinates(df, 'target_names')
plt.show()

# Andrews Curves Start Here:
plt.figure()
andrews_curves(df, 'target_names')
plt.show()

# Correlation matrix of the numeric features only. 'target_names' holds
# strings, so it is dropped before correlating, and the tick labels are taken
# from the correlation matrix itself; labelling with df.columns would put five
# labels on a 4x4 image.
corr = df.drop('target_names', axis=1).corr()
plt.imshow(corr, cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar()
tick_marks = list(range(len(corr.columns)))
plt.xticks(tick_marks, corr.columns, rotation='vertical')
plt.yticks(tick_marks, corr.columns)

plt.show()
示例#23
0
def plot_andrews(data_frame, class_name):
    """Save an Andrews-curves plot of ``data_frame`` grouped by ``class_name``.

    The current figure is cleared first; the image is written to
    ``Status.ANDREWS_NAME`` inside ``Status.TEMP_DIR``.
    """
    plt.clf()
    andrews_curves(data_frame, class_name)
    plt.title('Andrews Curve')
    target_path = join(Status.TEMP_DIR, Status.ANDREWS_NAME)
    plt.savefig(target_path)
示例#24
0
import matplotlib.pyplot as plt
from pandas import DataFrame
from pandas.tools.plotting import andrews_curves
from pandas.tools.plotting import parallel_coordinates
from pandas.tools.plotting import scatter_matrix


# Load dataset from sklearn
iris_data = load_iris()

# Concatenate the features and the target into one array, then wrap it in a
# DataFrame whose last column, 'Name', holds the class.
iris_cat = np.concatenate((iris_data.data, iris_data.target.reshape(150,1)), axis=1)
iris_df = DataFrame(iris_cat, columns=['PA', 'PB', 'PC', 'PD','Name'])

# Plot the data using
# 1 - Parallel Coordinates
plt.figure()
parallel_coordinates(iris_df, 'Name')

# 2 - Andrews Curves
plt.figure()
andrews_curves(iris_df, 'Name')

# 3 - Scatter plots
# scatter_matrix builds its own figure, so no plt.figure() is needed here (it
# would only leave an empty window). The size is given through ``figsize``;
# an unknown ``figure`` keyword would be forwarded to the scatter call.
scatter_matrix(iris_df, alpha=0.2, figsize=(6, 6), diagonal='kde')

# Show the plots
plt.show()

 def display_andrews_graph(self):
     """Show Andrews curves of ``df`` grouped by its "output" column."""
     class_column = "output"
     pdplt.andrews_curves(df, class_column, ax=None)
     plt.show()
示例#26
0
 def time_plot_andrews_curves(self):
     """Draw Andrews curves of ``self.df`` grouped by "Name"."""
     frame = self.df
     andrews_curves(frame, "Name")
示例#27
0
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

from pandas.tools.plotting import andrews_curves

# Look pretty...
matplotlib.style.use('ggplot')

#
# Load up the Seeds Dataset into a Dataframe
# It's located at 'Datasets/wheat.data'; the first column is used as the index.
#
df = pd.read_csv('Datasets/wheat.data', index_col=0)

#
# The 'area' and 'perimeter' features are kept: the drop below is disabled.
#
#df.drop(['area', 'perimeter'], axis=1, inplace=True)

#
# Plot Andrews curves grouped by the 'wheat_type' feature.
# NOTE(review): the exercise text asked for the optional display parameter
# alpha to be 0.4; no alpha is passed here.
#
plt.figure()
andrews_curves(df, 'wheat_type')

plt.show()
# It's located at 'Datasets/wheat.data'
# 
# .. your code here ..
# Load the seeds data set.
df = pd.read_csv('Datasets/wheat.data')


#
# Drop the 'id' feature. 'area' and 'perimeter' are kept in this variant so
# that their effect on the plot can be observed (see the questions at the end).
#
df = df.drop(labels=['id'], axis = 1)


#
# Plot an Andrews Curve grouped by the 'wheat_type' feature, with the optional
# display parameter alpha set to 0.4
#
plt.figure()
andrews_curves(df, 'wheat_type', alpha = 0.4)

plt.show()

# Questions:
# Are your outlier samples still easily identifiable in the plot?
# No
#
# After adding in the area and perimeter features, does your plot suffer from the same feature scaling issue you had with parallel
# coordinates?
# No
# Pair plot with a regression fit in every panel, colored by species
sns.pairplot(iris,hue='Species',kind='reg')

# 12. Heatmap of the correlation matrix, annotated with the coefficients
sns.heatmap(iris.corr(),linewidth=0.3,vmax=1.0,square=True, linecolor='black',annot=True)

# 13. Boxplot of sepal length per species
sns.boxplot(x='Species', y = 'SepalLength', data=iris)

# 14. Kdeplot of sepal length, one curve per species
sns.FacetGrid(iris,hue='Species',size=4) \
   .map(sns.kdeplot,'SepalLength') \
   .add_legend()

# Andrews Curve ("Id" is dropped so it is not plotted as a feature)
from pandas.tools.plotting import andrews_curves
andrews_curves(iris.drop("Id", axis=1), 'Species')

# radviz ("Id" dropped likewise)
from pandas.tools.plotting import radviz
radviz(iris.drop("Id", axis=1), 'Species')










示例#30
0


#
# TODO: Drop the 'id', 'area', and 'perimeter' feature
# 
# .. your code here ..
# Feature set without 'id', 'area' and 'perimeter'.
df0 = df.drop(['id', 'area', 'perimeter'], axis = 1)


#
# Plot a parallel coordinates chart grouped by
# the 'wheat_type' feature, with the optional
# display parameter alpha set to 0.4
#
plt.figure()
parallel_coordinates(df0, 'wheat_type', alpha = 0.4)
plt.show()


# Andrews curves of the reduced feature set.
plt.figure()
andrews_curves(df0, 'wheat_type')
plt.show()

# Andrews curves again, this time keeping 'area' and 'perimeter'.
df1 = df.drop(['id'], axis = 1)
plt.figure()
andrews_curves(df1, 'wheat_type')
plt.show()

# Random reference data: an alternating 0/1 class column 'a' and twelve
# columns of standard-normal noise, 500 rows each.
df1 = pd.DataFrame({ 'a' : [0,1] * 250,
                    'b' : np.random.randn(500),
                    'c' : np.random.randn(500),
                    'd' : np.random.randn(500),
                    'e' : np.random.randn(500),
                    'f' : np.random.randn(500),
                    'g' : np.random.randn(500),
                    'h' : np.random.randn(500),
                    'i' : np.random.randn(500),
                    'j' : np.random.randn(500),
                    'k' : np.random.randn(500),
                    'l' : np.random.randn(500),
                    'm' : np.random.randn(500) })

# 6.3 Plot Andrews curves of the random data. Merely a jumble of lines
andrews_curves(df1, "a")

# 6.4 Plot now for titanic data. There appears to be a structure in that
#     there is clear separation between yellow and green lines
andrews_curves(df, "Survived")


# 7.1 Draw parallel coordinates
# 7.2 First with random data
parallel_coordinates(df1, "a")
# 7.3 Next with titanic data
#     Presence of structure is evident
parallel_coordinates(df, "Survived")


# 8. A final multivariate visualization technique pandas
示例#32
0
def plot_andrews(data_frame, class_name):
    """Render Andrews curves for ``data_frame`` and save them as an image.

    Clears the current figure, plots the curves grouped by ``class_name`` and
    writes the result to ``Status.ANDREWS_NAME`` under ``Status.TEMP_DIR``.
    """
    plt.clf()
    andrews_curves(data_frame, class_name)
    plt.title('Andrews Curve')
    output_file = join(Status.TEMP_DIR, Status.ANDREWS_NAME)
    plt.savefig(output_file)
示例#33
0
# Plot a parallel coordinates chart grouped by
# the 'wheat_type' feature, with the optional
# display parameter alpha set to 0.4
#
parallel_coordinates(w2, "wheat_type", alpha = 0.4)  


plt.show()


# Andrews curves (assignment 5)

from pandas.tools.plotting import andrews_curves

# Andrews curves of the same frame used for the parallel coordinates plot.
andrews_curves(w2, "wheat_type", alpha = 0.4)

# Copy of w2 with the perimeter and area features of wheatData added.
# NOTE(review): the perimeter values are stored under the column name
# "parameter" -- presumably a typo for "perimeter"; confirm before renaming.
w3 = w2.copy()
w3["parameter"] = wheatData.perimeter
w3["area"] = wheatData.area

# Andrews curves including the two added features.
andrews_curves(w3, "wheat_type", alpha = 0.4)








示例#34
0
import matplotlib.pyplot as plt
import matplotlib

from pandas.tools.plotting import andrews_curves

# Look pretty...
matplotlib.style.use('ggplot')


# Load the seeds data set, using the first column as the index. The 'area'
# and 'perimeter' features are kept.
seeds = pd.read_csv("Datasets/wheat.data", index_col=0)

#
# Plot Andrews curves grouped by the 'wheat_type' feature with the optional
# display parameter alpha set to 0.4. alpha is an opacity and has to lie
# between 0 and 1; the value 4 is outside that range.
#
plt.figure(dpi=267)
andrews_curves(seeds, 'wheat_type', alpha=0.4)

plt.show()


示例#35
0
# Name the three columns, order the rows by age and print the Pearson
# correlation matrix.
# NOTE(review): Python 2 print statement and the legacy DataFrame.sort call;
# both need porting before this runs on Python 3 / current pandas.
data.columns = ["age","bmi","chol"]
data.sort(["age"],inplace=True)
print data.corr(method="pearson")

#%%
# Cholesterol against age as a thick magenta line.
line, = plt.plot(data.age, data.chol, "m-", linewidth = 5.0)
#line.set_antialiased(False)
# with no property arguments, setp lists the settable properties of the line
plt.setp(line)
plt.xlabel("age")
plt.ylabel("chol")
plt.show()

#ax = plt.subplot(111)
#t = np.arange(0.0, 5.0, 0.01)
#s = np.cos(2*np.pi*t)
#line, = plt.plot(t, s, lw=2)
#plt.annotate('local max', xy=(2, 1), xytext=(3, 1.5),
#            arrowprops=dict(facecolor='black', shrink=0.05),
#            )
#plt.ylim(-2,2)
#plt.show()
#print plt.style.available

#%% 
#data.plot(y = data.index)
# Replace ``data`` with the iris CSV from the pandas repository.
from pandas.tools.plotting import andrews_curves
data = pd.read_csv("https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv")

#%%
# Andrews curves grouped by the "Name" column.
andrews_curves(data, "Name")
示例#36
0
def _plot_kde_boxplot(column, filename=None):
    """Draw a box plot by 'classe' above a histogram + KDE of ``df[column]``.

    The figure is written to ``filename`` when one is given and is always
    closed afterwards.
    """
    fig, axes = plt.subplots(2, 1, figsize=(10, 16))
    values = pd.Series(df[column])
    # plt.subplots(2, 1) returns a 1-D array of axes: axes[0] is the upper
    # panel and axes[1] the lower one.
    df.boxplot(column=column, by="classe", ax=axes[0])
    values.hist(color='g', ax=axes[1], normed=True)
    values.plot(kind="KDE", ax=axes[1], style='r-', label=" KDE", legend=True)
    if filename is not None:
        fig.savefig(filename)
    plt.close(fig)


def plot_main():
    """Write the overview plots of the module-level frame ``df`` to PNG files.

    For every statistic column a two-panel figure is drawn (box plot per
    'classe' on top, histogram with KDE below). Then Andrews curves with
    parallel coordinates, a scatter matrix and a RadViz plot are saved. All
    files go to the current working directory.
    """
    # 'moyenne' is drawn but, as before, not written to disk.
    _plot_kde_boxplot("moyenne")

    # (column, output file) for the remaining statistics
    saved_columns = (
        ("ecart-type", 'kde_boxplot_ecart-type.png'),
        ("mediane", 'kde_boxplot_mediane.png'),
        ("entropie", 'kde_boxplot_entropie.png'),
        ("uniformit", 'kde_boxplot_uniformit.png'),
        ("surface", 'kde_boxplot_surface.png'),
        ("eccentricity", 'kde_boxplot_eccentricitie.png'),
    )
    for column, filename in saved_columns:
        _plot_kde_boxplot(column, filename)

    # Andrews curves & parallel coordinates plotting
    fig, axes = plt.subplots(2, 1, figsize=(10, 17))
    andrews_curves(df, 'classe', ax=axes[0])
    parallel_coordinates(df, 'classe', ax=axes[1])
    plt.savefig('Andrew_curves_df2.png')
    plt.close()

    # Scatter matrix with KDE on the diagonal
    fig, axes = plt.subplots(1, 1, figsize=(10, 10))
    scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde', ax=axes)
    plt.savefig('matrix_corr_kde.png')
    plt.close()

    # RadViz visualization of the multivariate data
    fig, axes = plt.subplots(1, 1, figsize=(10, 10))
    radviz(df, "classe", ax=axes)
    plt.savefig('RadViz_df.png')
    plt.close()
示例#37
0
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 08 12:56:53 2016

@author: paul.buxton
"""

from sklearn.datasets import load_iris
from pandas.tools.plotting import andrews_curves

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# Look pretty...
matplotlib.style.use('ggplot')
# If the above line throws an error, use plt.style.use('ggplot') instead

# Load up SKLearn's Iris Dataset into a Pandas Dataframe
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
# Map every numeric target to its class name; this is the grouping column.
df['target_names'] = [data.target_names[i] for i in data.target]

# Andrews Curves Start Here: one curve per sample, colored by class name
plt.figure()
andrews_curves(df, 'target_names')
plt.show()
示例#38
0
# Pair plot of every feature pair, colored by species ("Id" is dropped so it
# is not treated as a feature)
sns.pairplot(iris.drop("Id", axis=1), hue="Species", size=3)
plt.show()
# The diagonal elements in a pairplot show the histogram by default
# We can update these elements to show other things, such as a kde
sns.pairplot(iris.drop("Id", axis=1), hue="Species", size=3, diag_kind="kde")
plt.show()

# Now that we've covered seaborn, let's go back to some of the plots we can make with Pandas
# We can quickly make a boxplot with Pandas on each feature split out by species
iris.drop("Id", axis=1).boxplot(by="Species", figsize=(12, 6))
plt.show()
# One more sophisticated technique pandas has available is called Andrews Curves
# Andrews Curves involve using attributes of samples as coefficients for Fourier series
# and then plotting these
from pandas.tools.plotting import andrews_curves
andrews_curves(iris.drop("Id", axis=1), "Species")
plt.show()
# Another multivariate visualization technique pandas has is parallel_coordinates
# Parallel coordinates plots each feature on a separate column & then draws lines
# connecting the features for each data sample
from pandas.tools.plotting import parallel_coordinates
parallel_coordinates(iris.drop("Id", axis=1), "Species")
plt.show()
# A final multivariate visualization technique pandas has is radviz
# Which puts each feature as a point on a 2D plane, and then simulates
# having each sample attached to those points through a spring weighted
# by the relative value for that feature
from pandas.tools.plotting import radviz
radviz(iris.drop("Id", axis=1), "Species")
plt.show()
示例#39
0
 def time_plot_andrews_curves(self):
     """Draw Andrews curves for ``self.df`` with "Name" as the class column."""
     frame = self.df
     andrews_curves(frame, "Name")
示例#40
0
    if (Parser().Get_TEM_File() and Surviving_Phylogeny):
        print "Phylogeny "
        Parser().Read_Tumour_Evolution_File(False, 2000)
        #Parser().Filtered_Clonal_Value(CE_Frequency_Filter)
        Parser().Raw_Clonal_Values()
        Parser().Surviving_Phylogenetic_Tree()

    #Phylogenetic treee reconstructions
    if (Parser().Get_TEM_File() and False):
        Parser().Read_Tumour_Evolution_File(False, 2000)
        Parser().Filtered_Clonal_Value(0.1)
        #print Tumour_Evolution.keys()
        #print DataStructs[len(DataStructs)-1]
        #P_H =  Tumour_Evolution['P-0:0'][3]
        #print P_H[0], P_H[len(P_H)-1]
        print "PC Ploting"

    # Ploting final population parallel coordinates
    if (Parser().Valid_Final_Population_File() and False):
        Final_Population_df = Parser().Read_Final_Population_File_Top_values(1)
        #Final_Population_df = Parser().Read_Final_Population_File()
        Final_Population_df = Parser().Normalise_df_axis()
        #print Final_Population_df
        print "Ploting", len(Final_Population_df)
        plt.figure()
        #parallel_coordinates(Final_Population_df, 'Extinct', color=['blue','black','red']).set_title("PC Plot")
        andrews_curves(Final_Population_df, 'Extinct', colormap='jet')
        plt.show()
        print "Done"
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import seaborn as sns
import matplotlib.pyplot as plt

# ``pandas.tools.plotting`` was deprecated in pandas 0.20 and removed in later
# releases. Prefer ``pandas.plotting`` and keep the legacy path only as a
# fallback for old pandas installs.
try:
    from pandas.plotting import andrews_curves, parallel_coordinates
except ImportError:
    from pandas.tools.plotting import andrews_curves, parallel_coordinates

# Build a DataFrame from scikit-learn's iris data: four feature columns with
# short names, plus the integer class label in 'target'.
iris = load_iris()

df_iris = pd.DataFrame(
    iris['data'],
    columns=['sepallength', 'sepalwidth', 'petallength', 'petalwidth'])

df_iris['target'] = iris['target']
print(df_iris.head())

# Number of rows per class label.
print(df_iris['target'].value_counts())

# Sepal length vs. sepal width scatter plot (pandas).
df_iris.plot(kind='scatter', x='sepallength', y='sepalwidth')
plt.show()

# seaborn views of the same two features, and of sepal length per class.
# NOTE(review): newer seaborn releases renamed ``size`` to ``height`` --
# confirm against the installed seaborn version.
sns.jointplot(x='sepallength', y='sepalwidth', data=df_iris, size=5)
sns.FacetGrid(df_iris, hue='target', size=5).map(
    plt.scatter, 'sepallength', 'sepalwidth').add_legend()
sns.boxplot(x='target', y='sepallength', data=df_iris)
ax = sns.boxplot(data=df_iris, x='target', y='sepallength')
ax = sns.stripplot(data=df_iris, x='target', y='sepallength',
                   jitter=True, edgecolor='green')
sns.violinplot(x='target', y='sepallength', data=df_iris, size=5)
sns.FacetGrid(df_iris, hue='target', size=5).map(
    sns.kdeplot, 'sepallength').add_legend()
sns.pairplot(df_iris, hue='target', size=4)
sns.pairplot(df_iris, hue='target', size=4, diag_kind='kde')

# pandas multivariate plots, all grouped by the class label.
df_iris.boxplot(by='target', figsize=(20, 10))
andrews_curves(df_iris, 'target')
parallel_coordinates(df_iris, 'target')
示例#42
0
# Apply matplotlib's 'ggplot' style sheet to the figure below.
matplotlib.style.use('ggplot')


# Read the Seeds (wheat) dataset into a DataFrame from the local
# course-material directory.
path = "C:/Users/jbennett02/Documents/Magic Briefcase/classwork/edx/Microsoft/DAT210x.b/module3/Datasets/"
df = pd.read_csv(path + "wheat.data")

# Remove only the 'id' column; 'area' and 'perimeter' are kept.
df = df.drop('id', axis=1)

# Andrews curves grouped by the 'wheat_type' column. No ``alpha`` is passed,
# so the plotting function's default line opacity applies.
plt.figure()
andrews_curves(df, 'wheat_type')
plt.show()
示例#43
0
# Each plot below follows the same pattern: draw, save as EPS at 600 dpi,
# then show the figure before the next plot is drawn.
plt.savefig('databox.eps', format='eps', dpi=600)
plt.show()

# Box plots for each per-species frame.
setosa.boxplot()
plt.title('Iris-setosa boxplot')
plt.savefig('setobox.eps', format='eps', dpi=600)
plt.show()
versi.boxplot()
plt.title('Iris-versi boxplot')
plt.savefig('versibox.eps', format='eps', dpi=600)
plt.show()
verginica.boxplot()
plt.title('Iris-verginica boxplot')
plt.savefig('verginicabox.eps', format='eps', dpi=600)
plt.show()

# Andrews curves grouped by class, with the legend moved via bbox_to_anchor.
andrews_curves(data, 'irisclass').legend(bbox_to_anchor=(0.4, 1))
plt.savefig('andrews_curve.eps', format='eps', dpi=600)
# Show the Andrews-curves figure before starting the next plot, as every other
# plot in this script does. Without this, radviz is drawn while the
# Andrews-curves figure is still the current one, so 'radviz.eps' would
# contain both plots.
plt.show()

# RadViz grouped by class.
radviz(data, 'irisclass').legend(bbox_to_anchor=(1.1, 1))
plt.savefig('radviz.eps', format='eps', dpi=600)
plt.show()

# Parallel coordinates grouped by class.
parallel_coordinates(data, 'irisclass').legend(bbox_to_anchor=(1, 1))
plt.savefig('paracoor.eps', format='eps', dpi=600)
plt.show()

# Scatter-plot matrix coloured by class (seaborn).
sns.set(style="ticks")
sns.pairplot(data, hue="irisclass")
plt.savefig('scatermatrix.eps', format='eps', dpi=600)
plt.show()
示例#44
0
import pandas as pd

# Load the iris CSV straight from the pandas GitHub repository.
# NOTE(review): this URL points at the old ``pydata/pandas`` location and may
# no longer resolve -- confirm before relying on it.
data = pd.read_csv('https://raw.github.com/pydata/pandas/master/pandas/tests/data/iris.csv')

data.head()


# In[ ]:


# ``pandas.tools.plotting`` was deprecated in pandas 0.20 and removed in later
# releases. Prefer ``pandas.plotting`` and keep the legacy path only as a
# fallback for old pandas installs.
try:
    from pandas.plotting import andrews_curves
except ImportError:
    from pandas.tools.plotting import andrews_curves

# NOTE(review): ``plt`` is not imported in this excerpt -- presumably an
# earlier notebook cell imports matplotlib.pyplot; confirm.
plt.figure()

# Andrews curves grouped by the 'Name' column.
andrews_curves(data, 'Name')


# 表格函数应用
# 可以通过将函数和适当数量的参数作为管道参数来执行自定义操作。 因此,对整个DataFrame执行操作。
# 例如,为DataFrame中的所有元素相加一个值2。
# In[99]:


import pandas as pd
import numpy as np

def adder(ele1, ele2):
    """Return the result of ``ele1 + ele2``.

    The operands are combined with the ``+`` operator exactly as given, so
    the result type is whatever ``+`` produces for them.
    """
    total = ele1 + ele2
    return total

# Seed NumPy's global random number generator so later random draws are repeatable.
np.random.seed(293423)
示例#45
0
import matplotlib

# ``pandas.tools.plotting`` was deprecated in pandas 0.20 and removed in later
# releases. Prefer ``pandas.plotting`` and keep the legacy path only as a
# fallback for old pandas installs.
try:
    from pandas.plotting import andrews_curves
except ImportError:
    from pandas.tools.plotting import andrews_curves

# Apply the 'ggplot' style sheet through pyplot.
plt.style.use('ggplot')


# Load the Seeds (wheat) dataset, located at 'Datasets/wheat.data',
# into a DataFrame.
df = pd.read_csv('Datasets/wheat.data')

# Drop the 'id' column. 'area' and 'perimeter' are left in place.
# NOTE(review): the original exercise text also asked for 'area' and
# 'perimeter' to be removed -- confirm whether keeping them is intended.
df = df.drop(labels=['id'], axis=1)

# Plot Andrews curves grouped by the 'wheat_type' column, with the line
# opacity (alpha) set to 0.4.
plt.figure()
andrews_curves(df, 'wheat_type', alpha=0.4)
plt.show()
示例#46
0
文件: 1.py 项目: jiamc618/work1
# Kernel-density plot of the dataset.
dataset.plot(kind='kde')

# Box plots drawn as separate subplots in a 2x2 grid, with neither axis shared.
box_plot_options = dict(subplots=True, layout=(2, 2),
                        sharex=False, sharey=False)
dataset.plot(kind='box', **box_plot_options)
import pandas

# ``pandas.tools.plotting`` was deprecated in pandas 0.20 and removed in later
# releases. Prefer ``pandas.plotting`` and keep the legacy path only as a
# fallback for old pandas installs.
try:
    from pandas.plotting import (andrews_curves, parallel_coordinates, radviz,
                                 scatter_matrix)
except ImportError:
    from pandas.tools.plotting import (andrews_curves, parallel_coordinates,
                                       radviz, scatter_matrix)

# The UCI iris file has no header row, so column names are supplied here.
# The file is read once; every plot below uses the same frame.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)

# Multivariate plots grouped by the 'class' column.
# NOTE(review): no new figure is created between these calls, so they
# presumably draw onto the same current axes -- confirm whether that is wanted.
radviz(dataset, 'class')

andrews_curves(dataset, 'class')

parallel_coordinates(dataset, 'class')

# Pairwise scatter plots with kernel-density estimates on the diagonal.
scatter_matrix(dataset, alpha=0.2, figsize=(6, 6), diagonal='kde')
from sklearn.datasets import load_iris
示例#47
0

# Select the columns of interest.
# Both charts below are drawn from this reduced DataFrame.
df2 = df[['compactness','length','width','asymmetry','groove','wheat_type']]


# Import both plotting helpers from ``pandas.plotting``. The script already
# relied on that module for parallel_coordinates, while andrews_curves came
# from the deprecated (and later removed) ``pandas.tools.plotting``.
from pandas.plotting import andrews_curves, parallel_coordinates


# Create the parallel coordinates chart, grouped by 'wheat_type'.
plt.figure(figsize=(8, 8))
parallel_coordinates(df2, 'wheat_type', alpha=0.8)

# Save the chart before showing it.
plt.savefig("Wheat_Parallel_Coordinates.png", orientation="landscape", dpi=100)

plt.show()


# Create the Andrews curves chart, grouped by 'wheat_type'.
plt.figure(figsize=(8, 8))
andrews_curves(df2, "wheat_type", alpha=0.8)

# Save the chart before showing it.
plt.savefig("Wheat_Andrews_Curve.png", orientation="landscape", dpi=100)

plt.show()

# Location of the ROBIN data
dir_GD = u"C:/Users/Miguel/Documents/1 Nube/GoogleDrive"
dir_ROBIN = u"/2 Proyectos/RoBiN/Datos RoBiN/México/0_Vigente"
dir_clima = u"/GIS/Mapas_base/2004/clima"

# Work from the 2004 base-map directory and take the first entry whose name
# matches the climate CSV pattern (raises IndexError when nothing matches).
os.chdir(dir_GD + dir_ROBIN + u"/GIS/Mapas_base/2004")
archivos = os.listdir(os.curdir)
nombre_archivo = [
    nombre for nombre in archivos
    if re.search(".+clima-mx\\.csv$", nombre)
][0]

# Read the climate data
datos = pd.read_csv(nombre_archivo)
datos.head()

# Rows 1-4 of the column at position 3
datos.iloc[1:5, 3]

# Overlaid, semi-transparent 100-bin histograms of the columns at
# positions 3 and 4, drawn on one small figure.
fig1 = plt.figure(figsize=(4, 3))
for column_position in (3, 4):
    datos.iloc[:, column_position].plot(kind="hist", alpha=0.5, bins=100)

# Reshape to long form: the column at position 38 is the identifier and the
# column at position 2 supplies the values.
pp_vars = list(datos.columns[2:3])
zvh_var = [datos.columns[38]]
melted = pd.DataFrame(pd.melt(datos, id_vars=zvh_var, value_vars=pp_vars))

# Andrews curves of the melted values, grouped by 'Zvh_observed'.
# NOTE(review): this assumes the column at position 38 is named
# 'Zvh_observed' -- confirm against the CSV header.
pp_melted = melted[["value", "Zvh_observed"]]
plt.figure()
andrews_curves(pp_melted, 'Zvh_observed')
melted.head()