def result_visualization(x_test, y_test, result):
    """Show RadViz plots of the true and the predicted iris classes.

    Two figures are opened, one coloured by the true class and one by the
    predicted class, and then displayed with ``plt.show()``.

    Args:
        x_test: Feature matrix whose transpose has one row per sample and
            four columns (sepal length/width, petal length/width).
        y_test: True class indices (0, 1 or 2), one per sample; its
            ``shape[0]`` gives the number of samples.
        result: Predicted class indices, indexed like ``y_test``.
    """
    cols = y_test.shape[0]
    labels = ['setosa', 'versicolor', 'virginica']
    feature_names = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']

    # Map 0/1/2 to species names. int() lets float-typed labels such as 1.0
    # (common for model outputs) be used as list indices.
    y = [labels[int(y_test[i])] for i in range(cols)]
    pre = [labels[int(result[i])] for i in range(cols)]

    # Build the frames from the numeric features directly. Stacking features
    # and names into one array would coerce everything to strings and force
    # a cast back to float before radviz can use the columns.
    features = np.asarray(x_test, dtype=float).T
    df_real = pd.DataFrame(features, columns=feature_names)
    df_real['Species'] = y
    df_prediction = pd.DataFrame(features, columns=feature_names)
    df_prediction['Species'] = pre

    # Plot: one figure for the true classes, one for the predictions.
    plt.figure('真实分类')
    radviz(df_real, 'Species', color=['blue', 'green', 'red', 'yellow'])
    plt.figure('预测分类')
    radviz(df_prediction, 'Species', color=['blue', 'green', 'red', 'yellow'])
    plt.show()
Exemplo n.º 2
0
def result_visualization(x_test, y_test, result):
    """Show RadViz plots of the true and the predicted iris classes.

    ``y_test`` and ``result`` are one-hot encoded with three rows and one
    column per sample. The row patterns ``(0, 0, 1)``, ``(0, 1, 0)`` and
    ``(1, 0, 0)`` decode to setosa, versicolor and virginica respectively;
    any other pattern decodes to ``'unknown'``.

    Args:
        x_test: Feature matrix whose transpose has one row per sample and
            four columns (sepal length/width, petal length/width).
        y_test: One-hot encoded true classes; ``shape[1]`` gives the number
            of samples.
        result: One-hot encoded predictions, indexed like ``y_test``.
    """
    cols = y_test.shape[1]
    feature_names = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']
    patterns = (
        ((0, 0, 1), 'setosa'),
        ((0, 1, 0), 'versicolor'),
        ((1, 0, 0), 'virginica'),
    )

    def decode(one_hot):
        """Return one species name per column of a 3-row one-hot matrix."""
        names = []
        for col in range(cols):
            code = (one_hot[0][col], one_hot[1][col], one_hot[2][col])
            for pattern, name in patterns:
                if all(v == p for v, p in zip(code, pattern)):
                    names.append(name)
                    break
            else:
                # Always emit a label so the list stays aligned with the
                # feature rows, even for a column that is not a clean
                # one-hot vector.
                names.append('unknown')
        return names

    # Decode the one-hot class encodings (true labels and predictions alike).
    y = decode(y_test)
    pre = decode(result)

    # Build the frames from the numeric features directly so the feature
    # columns are float, as radviz requires.
    features = np.asarray(x_test, dtype=float).T
    df_real = pd.DataFrame(features, columns=feature_names)
    df_real['Species'] = y
    df_prediction = pd.DataFrame(features, columns=feature_names)
    df_prediction['Species'] = pre

    # Plot: one figure for the true classes, one for the predictions.
    plt.figure('真实分类')
    radviz(df_real, 'Species', color=['blue', 'green', 'red', 'yellow'])
    plt.figure('预测分类')
    radviz(df_prediction, 'Species', color=['blue', 'green', 'red', 'yellow'])
    plt.show()
Exemplo n.º 3
0
def test_radviz():
    """Load a CSV file, print its first rows and show a RadViz plot of it.

    The class column used for colouring is ``'Name'``.
    """
    # data = pd.read_csv('e:/tmp/22/iris.csv')
    data = pd.read_csv('e:/tmp/22/201404.out')
    # Function-call form of print works on both Python 2 and Python 3.
    print(data.head())

    plt.figure()
    radviz(data, 'Name')
    plt.show()
Exemplo n.º 4
0
def plot_radviz(dataset):
    """
    Draw and display a RadViz plot for the given DataSet.

    RadViz is a way of looking at data that has more than two dimensions.
    """
    # radviz expects a DataFrame plus the name of the column holding the
    # class membership, so hand it the dataset's merged data/labels frame
    # and the name of its label series.
    labelled_frame = dataset.get_labelled_data_frame()
    class_column = dataset.get_labels().name
    radviz(labelled_frame, class_column)
    plt.show()
Exemplo n.º 5
0
def result_visualization(x_test, y_test, result):
    """Show RadViz plots of the true and the predicted iris classes.

    ``y_test`` and ``result`` are one-hot encoded with three rows and one
    column per sample. The row patterns ``(0, 0, 1)``, ``(0, 1, 0)`` and
    ``(1, 0, 0)`` decode to setosa, versicolor and virginica respectively;
    any other pattern decodes to ``'未知种类'`` ("unknown species").

    Args:
        x_test: Feature matrix whose transpose has one row per sample and
            four columns (sepal length/width, petal length/width).
        y_test: One-hot encoded true classes; ``shape[1]`` gives the number
            of samples.
        result: One-hot encoded predictions, indexed like ``y_test``.
    """
    cols = y_test.shape[1]
    feature_names = ['萼片长度', '萼片宽度', '花瓣长度', '花瓣宽度']
    patterns = (
        ((0, 0, 1), 'setosa'),
        ((0, 1, 0), 'versicolor'),
        ((1, 0, 0), 'virginica'),
    )

    def decode(one_hot):
        """Return one species name per column of a 3-row one-hot matrix."""
        names = []
        for col in range(cols):
            code = (one_hot[0][col], one_hot[1][col], one_hot[2][col])
            for pattern, name in patterns:
                if all(v == p for v, p in zip(code, pattern)):
                    names.append(name)
                    break
            else:
                # Always emit a label so the list stays aligned with the
                # feature rows, even for a column that is not a clean
                # one-hot vector.
                names.append('未知种类')
        return names

    # Decode the one-hot class encodings (true labels and predictions alike).
    y = decode(y_test)
    pre = decode(result)

    # Build the frames from the numeric features directly so the feature
    # columns are float, as radviz requires.
    features = np.asarray(x_test, dtype=float).T
    df_real = pd.DataFrame(features, columns=feature_names)
    df_real['种类'] = y
    df_prediction = pd.DataFrame(features, columns=feature_names)
    df_prediction['种类'] = pre

    # Configure the font for Chinese labels before anything is drawn, so the
    # settings are already in effect for both figures.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Plot: one figure for the true classes, one for the predictions.
    plt.figure('真实分类')
    radviz(df_real, '种类', color=['blue', 'green', 'red', 'yellow'])
    plt.figure('预测分类')
    radviz(df_prediction, '种类', color=['blue', 'green', 'red', 'yellow'])
    plt.show()
Exemplo n.º 6
0
    def test_radviz(self, iris):
        """Exercise ``radviz`` with hex, named, colormap and RGBA colours.

        ``iris`` is used as the input frame with ``"Name"`` as the class
        column; a small hand-built frame is used for the legend check.
        """
        from pandas.plotting import radviz
        from matplotlib import cm

        df = iris
        # Default colours: only check that plotting works.
        _check_plot_works(radviz, frame=df, class_column="Name")

        # Explicit hex colours.
        rgba = ("#556270", "#4ECDC4", "#C7F464")
        ax = _check_plot_works(radviz, frame=df, class_column="Name", color=rgba)
        # skip Circle drawn as ticks
        patches = [p for p in ax.patches[:20] if p.get_label() != ""]
        self._check_colors(patches[:10], facecolors=rgba, mapping=df["Name"][:10])

        # Named colours.
        # NOTE(review): `ax` is not rebound by the call below, so the patches
        # inspected afterwards still come from the axes of the `color=rgba`
        # call above -- confirm this is intended.
        cnames = ["dodgerblue", "aquamarine", "seagreen"]
        _check_plot_works(radviz, frame=df, class_column="Name", color=cnames)
        patches = [p for p in ax.patches[:20] if p.get_label() != ""]
        self._check_colors(patches, facecolors=cnames, mapping=df["Name"][:10])

        # Colormap: one colour per distinct class, evenly spaced over cm.jet.
        # NOTE(review): `ax` is again not rebound here.
        _check_plot_works(radviz, frame=df, class_column="Name", colormap=cm.jet)
        cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())]
        patches = [p for p in ax.patches[:20] if p.get_label() != ""]
        self._check_colors(patches, facecolors=cmaps, mapping=df["Name"][:10])

        # RGBA colour lists: check the legend handles of a three-row frame.
        colors = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [1.0, 0.0, 0.0, 1.0]]
        df = DataFrame(
            {"A": [1, 2, 3], "B": [2, 1, 3], "C": [3, 2, 1], "Name": ["b", "g", "r"]}
        )
        ax = radviz(df, "Name", color=colors)
        handles, labels = ax.get_legend_handles_labels()
        self._check_colors(handles, facecolors=colors)
Exemplo n.º 7
0
def radviz_quad_features(filename=None):
    """Save side-by-side RadViz plots of the quadrant features in a CSV file.

    The CSV is read with its first column as the index. The left panel is
    coloured by the ``TEXTURES`` column (with ``angle`` dropped) and the
    right panel by the ``angle`` column (with ``TEXTURES`` dropped). The
    figure is written to ``output_dir`` as ``Radviz_<basename>.png``.
    """
    features = pd.read_csv(filename, index_col=0)
    by_angle = features.drop(labels=['TEXTURES'], axis=1)
    by_texture = features.drop(labels=['angle'], axis=1)

    # Other styles that were tried: 'ggplot', 'classic', 'seaborn-dark'.
    plt.style.use(['bmh'])
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(11, 6))
    left_ax, right_ax = axes

    left_ax = radviz(by_texture, 'TEXTURES', ax=left_ax,
                     colormap='gist_rainbow')
    left_ax.set_title('Textures - radviz', loc='center', fontsize=10)

    right_ax = radviz(by_angle, 'angle', ax=right_ax, colormap='rainbow')
    right_ax.set_title('Angles - radviz', loc='center', fontsize=10)

    fig.tight_layout()
    plt.subplots_adjust(left=0.05, wspace=0.15, top=0.9)
    plt.savefig(os.path.join(
        output_dir,
        'Radviz_' + get_basename(filename) + '.png',
    ))
Exemplo n.º 8
0
    def test_radviz(self, iris):
        """Exercise ``radviz`` with hex, named, colormap and RGBA colours.

        ``iris`` is used as the input frame with ``'Name'`` as the class
        column; a small hand-built frame is used for the legend check.
        """
        from pandas.plotting import radviz
        from matplotlib import cm

        df = iris
        # Default colours: only check that plotting works.
        _check_plot_works(radviz, frame=df, class_column='Name')

        # Explicit hex colours.
        rgba = ('#556270', '#4ECDC4', '#C7F464')
        ax = _check_plot_works(radviz,
                               frame=df,
                               class_column='Name',
                               color=rgba)
        # skip Circle drawn as ticks
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches[:10],
                           facecolors=rgba,
                           mapping=df['Name'][:10])

        # Named colours.
        # NOTE(review): `ax` is not rebound by the call below, so the patches
        # inspected afterwards still come from the axes of the `color=rgba`
        # call above -- confirm this is intended.
        cnames = ['dodgerblue', 'aquamarine', 'seagreen']
        _check_plot_works(radviz, frame=df, class_column='Name', color=cnames)
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10])

        # Colormap: one colour per distinct class, evenly spaced over cm.jet.
        # NOTE(review): `ax` is again not rebound here.
        _check_plot_works(radviz,
                          frame=df,
                          class_column='Name',
                          colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10])

        # RGBA colour lists: check the legend handles of a three-row frame.
        colors = [[0., 0., 1., 1.], [0., 0.5, 1., 1.], [1., 0., 0., 1.]]
        df = DataFrame({
            "A": [1, 2, 3],
            "B": [2, 1, 3],
            "C": [3, 2, 1],
            "Name": ['b', 'g', 'r']
        })
        ax = radviz(df, 'Name', color=colors)
        handles, labels = ax.get_legend_handles_labels()
        self._check_colors(handles, facecolors=colors)
Exemplo n.º 9
0
    def test_radviz(self):
        """Exercise ``radviz`` with hex, named, colormap and RGBA colours.

        ``self.iris`` is used as the input frame with ``'Name'`` as the class
        column; a small hand-built frame is used for the legend check.
        """
        from pandas.plotting import radviz
        from matplotlib import cm

        df = self.iris
        # Default colours: only check that plotting works.
        _check_plot_works(radviz, frame=df, class_column='Name')

        # Explicit hex colours.
        rgba = ('#556270', '#4ECDC4', '#C7F464')
        ax = _check_plot_works(
            radviz, frame=df, class_column='Name', color=rgba)
        # skip Circle drawn as ticks
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(
            patches[:10], facecolors=rgba, mapping=df['Name'][:10])

        # Named colours.
        # NOTE(review): `ax` is not rebound by the call below, so the patches
        # inspected afterwards still come from the axes of the `color=rgba`
        # call above -- confirm this is intended.
        cnames = ['dodgerblue', 'aquamarine', 'seagreen']
        _check_plot_works(radviz, frame=df, class_column='Name', color=cnames)
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10])

        # Colormap: one colour per distinct class, evenly spaced over cm.jet.
        # NOTE(review): `ax` is again not rebound here.
        _check_plot_works(radviz, frame=df,
                          class_column='Name', colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
        patches = [p for p in ax.patches[:20] if p.get_label() != '']
        self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10])

        # RGBA colour lists: check the legend handles of a three-row frame.
        colors = [[0., 0., 1., 1.],
                  [0., 0.5, 1., 1.],
                  [1., 0., 0., 1.]]
        df = DataFrame({"A": [1, 2, 3],
                        "B": [2, 1, 3],
                        "C": [3, 2, 1],
                        "Name": ['b', 'g', 'r']})
        ax = radviz(df, 'Name', color=colors)
        handles, labels = ax.get_legend_handles_labels()
        self._check_colors(handles, facecolors=colors)
# Draw a heatmap with the numeric values in each cell
sns.heatmap(data1, annot=True, fmt='f', linewidths=1)
plt.show()

import pickle

# Persist the fitted model to disk, then load it back from the same file.
with open('forest-riders.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('forest-riders.pkl', 'rb') as f:
    model = pickle.load(f)

# `pandas.tools.plotting` does not exist in current pandas; prefer the
# supported `pandas.plotting` location and fall back only for old installs.
try:
    from pandas.plotting import radviz
except ImportError:
    from pandas.tools.plotting import radviz

# RadViz of the whole dataset, coloured by the 'Yield' column.
plt.figure(figsize=(12, 12))
radviz(dataset, 'Yield')
plt.show()

from yellowbrick.features.rankd import Rank2D
# Instantiate the visualizer with the Covariance ranking algorithm
visualizer = Rank2D(features=features, algorithm='covariance')

visualizer.fit(X, y)  # Fit the data to the visualizer
visualizer.transform(X)  # Transform the data
# NOTE(review): newer yellowbrick releases may expose this as `show()`
# instead of `poof()` -- confirm against the installed version.
visualizer.poof()  # Draw/show/poof the data

################################################3
#######RANDOM FOREST REGRESSOR TREE######

# features = ['Year','Harvested','Value','Grow_total_p','Grow_avg_t','Price']
# target = 'Yield'
Exemplo n.º 11
0
# Exploratory plots of the iris data set read from ./Iris.csv.
# NOTE(review): several seaborn calls below pass `size=`; newer seaborn
# releases may expect `height=` instead -- confirm against the installed
# version.
sns.set(style="white", color_codes=True)

iris = pd.read_csv("./Iris.csv")
print(iris.head())
print(iris["Species"].value_counts())
# Scatter plot
iris.plot(kind="scatter", x="SepalLengthCm", y="SepalWidthCm")
# Scatter plot with marginal histograms
sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, size=5)
# Scatter plot with a legend, one colour per species
sns.FacetGrid(iris, hue="Species", size=5) \
   .map(plt.scatter, "SepalLengthCm", "SepalWidthCm") \
   .add_legend()
# Box plot
sns.boxplot(x="Species", y="PetalLengthCm", data=iris)

# Box plot with the individual points drawn on top
ax = sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
ax = sns.stripplot(x="Species", y="PetalLengthCm", data=iris, jitter=True, edgecolor="gray")

# Kernel density plot
sns.FacetGrid(iris, hue="Species", size=6) \
   .map(sns.kdeplot, "PetalLengthCm") \
   .add_legend()
# Pair plot of all features (the Id column is dropped first)
sns.pairplot(iris.drop("Id", axis=1), hue="Species", size=3)

# RadViz ("spring") plot of all features (the Id column is dropped first)
radviz(iris.drop("Id", axis=1), "Species")
plt.show()
Exemplo n.º 12
0
def RadvizPlot():
    """Draw a RadViz projection of the loaded data, coloured by "Species".

    The "Id" column is dropped before plotting. Nothing is returned and the
    figure is not shown here.
    """
    # Dimensionality-reduction style visualisation.
    features_with_species = load_data().drop("Id", axis=1)
    radviz(features_with_species, "Species")
Exemplo n.º 13
0
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix, radviz
import pandas
from sklearn.utils import shuffle

# Load the data from the CSV file
data_frame = pandas.read_csv("spambase/spambase.csv")
# Randomize the dataframe
data_frame = shuffle(data_frame)
# NOTE(review): sample(frac=1) reshuffles the already shuffled frame, and
# the iloc slice with step 1 selects every column unchanged -- both look
# redundant; confirm before removing.
sample_frame = data_frame.sample(frac=1)
sample_frame = sample_frame.iloc[:, ::1]

# Print the frame and show a RadViz plot coloured by the "spam" column.
print(sample_frame)
radviz(sample_frame, "spam")
plt.show()

# scatter_matrix(sample_frame, alpha=0.2)
# plt.show()
# calculating the accuracies
print("Training Accuracy :", model.score(x_train, y_train))
print("Testing Accuracy :", model.score(x_test, y_test))

# printing the confusion matrix

from sklearn.metrics import confusion_matrix

# creating a confusion matrix
cm = confusion_matrix(y_test, y_pred)

# printing the confusion matrix
plt.rcParams['figure.figsize'] = (8, 8)
sns.heatmap(cm, annot = True, cmap = 'Reds')
plt.title('Confusion Matrix for Random Forest', fontweight = 30, fontsize = 20)
plt.show()

# Notebook-only: installs yellowbrick through the IPython shell.
get_ipython().system('pip install yellowbrick')

# RadViz of the features, coloured by the target stored in a "status" column.
from pandas.plotting import radviz
fig, ax = plt.subplots(figsize=(12, 12))
new_df = x.copy()  # copy so the target column is not added to `x` itself
new_df["status"] = y
radviz(new_df, "status", ax=ax, colormap="rocket")
plt.title('Radial Visualization for Target', fontsize = 20)
plt.show()


# * It gives a clear Idea that Students getting very low grades have high correlation on Lunch and Parental Education
Exemplo n.º 15
0
# Map the numeric targets 0/1/2 to the first three entries of `species`
# and store the result in a "species" column.
species_dict = {0: species[0], 1: species[1], 2: species[2]}
iris_pair_df["species"] = target_df
iris_pair_df["species"] = iris_pair_df["species"].map(species_dict)
# sns.pairplot(iris_df, hue='species', size=2.5)

# ECDF
#
# ecdf = sm.distributions.ECDF(iris_df["sepal width (cm)"])
# x = np.linspace(iris_df["sepal width (cm)"].min(), iris_df["sepal width (cm)"].max())
# y = ecdf(x)
# plt.step(x, y)
# plt.title("Empirical CDF for Iris attributes - Sepal Width")
# plt.show()

# Percentile Plot
# position, sepal_width = probscale.plot_pos(iris_df["sepal width (cm)"])
# position *= 100
# fig, ax = plt.subplots(figsize=(6, 3))
# ax.plot(position, sepal_width, marker='.', label='Sepal Width')
# ax.set_xlabel('Percentile')
# ax.set_ylabel('Sepal Width (cm)')
# sns.despine()

# parallel coordinates
# parallel_coordinates(iris_pair_df, "species")
# plt.title('Parallel Coordinates visualization for Iris Dataset')

# RadViz plot of the frame, coloured by the "species" column
radviz(iris_pair_df, "species")
plt.title('Radvis multivarieate visualization')
Exemplo n.º 16
0
def visualize_radial(df, col, title):
    """Open a figure identified by ``title``, draw a RadViz of ``df`` and show it.

    ``col`` names the column of ``df`` that holds the class of each row.
    """
    plt.figure(num=title)
    radviz(frame=df, class_column=col)
    plt.show()
Exemplo n.º 17
0
# Box plot of "meanfun" per label.
# We can visualize other features by substituting "meanfun"
sns.boxplot(x="label", y="meanfun", data=dataset)
plt.show()

# ------ Distribution of male and female(every feature)
# We can visualize other features by substituting "meanfun"
# NOTE(review): newer seaborn releases may expect `height=` instead of
# `size=` -- confirm against the installed version.
sns.FacetGrid(dataset, hue="label", size=6) \
    .map(sns.kdeplot, "meanfun") \
    .add_legend()
plt.show()

# ------ Radviz circle
# Good to compare every feature
from pandas.plotting import radviz

radviz(dataset, "label")
plt.show()

#####################################################
#                                                   #
#        Starting with Sets and Pre-Processing      #
#                                                   #
#####################################################

# ------ Separating the Independent and Dependent Variables
# Getting all Columns, except the last one with the genders
X = dataset.iloc[:, :-1].values
# Getting the last column
# NOTE(review): index 20 is the last column only if the dataset has exactly
# 21 columns; `X` above uses -1 instead -- confirm the two agree.
y = dataset.iloc[:, 20].values

# ------ Taking Care of Missing Data
import pandas
import matplotlib.pyplot as plt

# Download the UCI iris data; the file has no header row, so column names
# are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)

# Each pandas plotting helper draws on the current axes when none is given,
# so open a fresh figure per plot; otherwise all three are overlaid on the
# same axes.
from pandas.plotting import radviz
plt.figure()
radviz(dataset, 'class')

from pandas.plotting import andrews_curves
plt.figure()
andrews_curves(dataset, 'class')

from pandas.plotting import parallel_coordinates
plt.figure()
parallel_coordinates(dataset, 'class')
plt.show()
Exemplo n.º 19
0
def visualisation(df_final,
                  visual_ctl,
                  Y,
                  X,
                  scatter_x,
                  features,
                  test,
                  Top_n_counts,
                  export_ctl=False,
                  dim_ctl=3):
    """Build the raw scatter, RadViz and parallel-coordinates figures.

    Args:
        df_final: DataFrame holding the columns named in ``scatter_x`` plus
            ``'Top_n_similar'``, ``'Class'`` and ``'similar_alpha'``.
        visual_ctl: Which figures to build: ``'raw'``, ``'radviz'``,
            ``'paral_coor'`` or ``'all'``. Any other value builds nothing.
        Y, X, features, test, Top_n_counts: Accepted for interface
            compatibility; not used by this function.
        scatter_x: List of column names to plot. The first two (or three,
            for ``dim_ctl == 3``) are the scatter axes.
        export_ctl: Export the figures to ``Output/`` when ``True`` or the
            string ``'True'``.
        dim_ctl: 2 for a 2-D raw scatter plot, 3 for a 3-D one.

    Returns:
        Tuple ``(raw_fig, radviz_fig, par_fig)``. A figure that was not
        requested through ``visual_ctl`` is ``None``.

    Raises:
        ValueError: If the raw plot is requested and ``dim_ctl`` is neither
            2 nor 3.
    """
    # Accept the boolean as well as the string form: the default is the
    # boolean False, so callers naturally pass True.
    do_export = export_ctl is True or export_ctl == 'True'

    # Figures that are not requested stay None, so the return statement is
    # valid for every value of visual_ctl.
    raw_fig = None
    radviz_fig = None
    par_fig = None

    ### Raw
    if visual_ctl in ['all', 'raw']:
        if dim_ctl == 2:
            raw_fig = px.scatter(df_final,
                                 x=scatter_x[0],
                                 y=scatter_x[1],
                                 color='Top_n_similar',
                                 size_max=18,
                                 symbol='Class',
                                 opacity=0.5)
        # 3d scatter plot
        elif dim_ctl == 3:
            raw_fig = px.scatter_3d(df_final,
                                    x=scatter_x[0],
                                    y=scatter_x[1],
                                    z=scatter_x[2],
                                    color='Top_n_similar',
                                    size_max=18,
                                    symbol='Class',
                                    opacity=0.5)
        else:
            raise ValueError(f'dim_ctl must be 2 or 3, got {dim_ctl!r}')

        # Layout
        raw_fig.update_layout(
            margin=dict(l=0, r=0, b=0, t=0),
            title="Scatter Plot",
        )
        # Output
        if do_export:
            print('Export Raw to Output/Raw-3D.png')
            raw_fig.write_image("Output/Raw-3D.png")

    ### Radviz
    if visual_ctl in ['all', 'radviz']:
        radviz_fig, ax = plt.subplots(nrows=1, ncols=1)
        # Turn off tick labels
        ax.set_yticklabels([])
        ax.set_xticklabels([])
        # radviz needs the class column inside the frame it is given, so
        # make sure it is selected alongside the feature columns.
        radviz_cols = list(scatter_x)
        if 'Top_n_similar' not in radviz_cols:
            radviz_cols.append('Top_n_similar')
        # Plot onto the axes created above.
        ax = radviz(df_final[radviz_cols],
                    "Top_n_similar",
                    ax=ax,
                    color=['Red', 'Orange', 'Blue', 'Green'],
                    alpha=0.5)
        ax.title.set_text('Radviz Plot of the Features')

        if do_export:
            print('Export Radviz to Output/MultiDimension_Radviz.png')
            radviz_fig.savefig('Output/MultiDimension_Radviz.png',
                               bbox_inches='tight')

    ### Parallel Coordinates
    if visual_ctl in ['all', 'paral_coor']:
        # Copy so the caller's list is not extended.
        parallel_x = scatter_x.copy()
        parallel_x.append('similar_alpha')
        # parallel plot
        par_fig = px.parallel_coordinates(
            df_final[parallel_x],
            color="similar_alpha",
            dimensions=scatter_x,
            color_continuous_scale=px.colors.diverging.Tealrose,
            color_continuous_midpoint=2)
        # Layout
        par_fig.update_layout(
            title={
                'text': "Parrallel Plot for Iris Neighbors",
                'y': 0.1,
                'x': 0.5,
                'xanchor': 'center',
                'yanchor': 'top'
            })
        # Output
        if do_export:
            print(
                'Export Parallel Coordinates to Output/MultiDimension_ParrCoor.png'
            )
            par_fig.write_image("Output/MultiDimension_ParrCoor.png")

    return raw_fig, radviz_fig, par_fig
#numerical data. Parallel Coordinates Plots are ideal for comparing many variables together and
#seeing the relationships between them. For example, if you had to compare an array of products with
#the same attributes (comparing computer or cars specs across different models).

# Parallel-coordinates plot of the iris data, one line per row, coloured
# by the "species" column.
from pandas.plotting import parallel_coordinates
parallel_coordinates(iris_data, "species")

# In[143]:

# Radviz Plot: RadViz is a multivariate data visualization algorithm that
# plots each feature dimension uniformly around the circumference of a
# circle, then plots points on the interior of the circle such that the
# point normalizes its values on the axes from the center to each arc.

from pandas.plotting import radviz
# NOTE(review): only two colours are given -- confirm that this matches the
# number of distinct values in "species".
radviz(iris_data, "species", color=['pink', 'green'])

# In[150]:

# Factorplot: a factor plot is informative when we have multiple groups to
# compare.
# NOTE(review): newer seaborn releases may provide this as `catplot`
# instead of `factorplot` -- confirm against the installed version.

sns.factorplot("species", "sepal_length", data=iris_data)
plt.ioff()
plt.show()

# In[153]:

# Boxen Plot: an enhanced box plot for larger datasets.

fig = plt.gcf()
fig.set_size_inches(10, 6)
Exemplo n.º 21
0
def lp_genes(
    data,
    kind="scatter",
    hue="Pattern",
    sizes=(2, 100),
    gridsize=20,
    random_state=4,
    ax=None,
    fname=None,
    **kwargs,
):
    """
    Plot the pattern distribution of each gene in a RadViz plot. RadViz projects
    an N-dimensional data set into a 2D space where the influence of each dimension
    can be interpreted as a balance between the influence of all dimensions.

    Parameters
    ----------
    data : AnnData
        Spatial formatted AnnData
    kind : str
        'scatter' for scatter plot, 'hex' for hex plot, default "scatter"
    hue : str
        Name of the column used to color points, default "Pattern". Any
        other name is looked up in data.var.
    sizes : tuple
        Minimum and maximum point size to scale points, default (2, 100)
    gridsize : int
        Number of hex bins along each axis, default 20
    random_state : int
        Seed used when shuffling the draw order of the scatter points,
        default 4
    ax : matplotlib Axes, optional
        Axes to draw on. A new 6x6 inch figure is created when omitted.
    fname : str, optional
        Save the figure to specified filename, by default None
    **kwargs
        Options to pass to matplotlib plotting method.
    """
    lp_stats(data)

    palette = dict(zip(PATTERN_NAMES, PATTERN_COLORS))

    # RADVIZ plot
    # Always resolve `fig`: the hex branch and the optional save need it
    # even when the caller supplied the axes.
    if ax is None:
        fig = plt.figure(figsize=(6, 6))
    else:
        fig = ax.get_figure()

    # Use Plot the "circular" axis and labels, hide points
    # TODO move "pattern" computation to lp_stats
    col_names = [f"{p}_fraction" for p in PATTERN_NAMES]
    # Copy so renaming columns and adding "Pattern" cannot touch (or warn
    # about) a slice of data.var.
    gene_frac = data.var[col_names].copy()
    gene_frac.columns = PATTERN_NAMES
    gene_frac["Pattern"] = gene_frac.idxmax(axis=1)
    # Single blank class: radviz is only used to get the circular layout and
    # the projected coordinates, not to color anything.
    gene_frac_copy = gene_frac.copy()
    gene_frac_copy["Pattern"] = ""

    custom_hue = bool(hue) and hue != "Pattern"
    if custom_hue:
        gene_frac = gene_frac.join(data.var[hue])

    if ax is None:
        ax = radviz(gene_frac_copy, "Pattern", s=0)
    else:
        radviz(gene_frac_copy, "Pattern", s=0, ax=ax)
    del gene_frac_copy
    ax.get_legend().remove()
    circle = plt.Circle((0, 0), radius=1, color="black", fill=False)
    ax.add_patch(circle)

    # Hide 2D axes
    ax.axis(False)

    # Get points
    pts = []
    for c in ax.collections:
        pts.extend(c.get_offsets().data)

    pts = np.array(pts).reshape(-1, 2)
    xy = pd.DataFrame(pts, index=gene_frac.index)
    xy["Pattern"] = gene_frac["Pattern"]
    if custom_hue:
        # The plotted frame must carry the hue column itself.
        xy[hue] = gene_frac[hue]

    # Plot points as scatter or hex
    if kind == "scatter":

        # Drop the invisible radviz points. Artist.remove() is used because
        # the Axes.collections list cannot be modified in place on current
        # matplotlib.
        ax.collections[0].remove()

        # Scale point size by max over the pattern fraction columns
        xy["Fraction of cells"] = gene_frac[list(PATTERN_NAMES)].max(axis=1)

        # Plot points
        sns.scatterplot(
            data=xy.sample(frac=1, random_state=random_state),
            x=0,
            y=1,
            size="Fraction of cells",
            hue=hue,
            sizes=sizes,
            linewidth=0,
            # The pattern palette only has entries for pattern names.
            palette=palette if hue == "Pattern" else None,
            ax=ax,
            **kwargs,
        )
        plt.legend(bbox_to_anchor=(1.05, 0.5),
                   loc="center left",
                   frameon=False)

    elif kind == "hex":
        # Hexbin
        xy.plot.hexbin(
            x=0,
            y=1,
            gridsize=gridsize,
            extent=(-1, 1, -1, 1),
            cmap=sns.light_palette("lightseagreen", as_cmap=True),
            mincnt=1,
            colorbar=False,
            ax=ax,
            **kwargs,
        )
        # [left, bottom, width, height]
        plt.colorbar(ax.collections[-1],
                     cax=fig.add_axes([1, 0.4, 0.05, 0.3]),
                     label="genes")

    if fname:
        # 'tight' keeps the legend / colorbar, which sit outside the axes,
        # inside the saved image.
        fig.savefig(fname, bbox_inches="tight")
Exemplo n.º 22
0
    list_of_best_features.append(df.columns[our_index])
    copy_of_importance_list[our_index] = 0 

# We create a new array containing only the n features
# NOTE(review): the first assignment is immediately overwritten by the
# second one.
df_scaled_new = df_scaled
df_scaled_new = df_scaled[list_of_best_features]
X = df_scaled_new.values

# Examine the dispersion of the defaults/non defaulted amongst the features in a multivariate setting
from pandas.plotting import radviz
plt.figure(figsize=(8,8))
df_all_data = pd.read_csv(your_path) 
Scaler = StandardScaler()
# NOTE(review): every column is standardised here, including the 'Default'
# column that is then used as the class column -- confirm this is intended.
scaled_data = Scaler.fit_transform(df_all_data)
df_all_data_scaled = pd.DataFrame(scaled_data , columns=df_all_data.columns)
radviz(df_all_data_scaled, 'Default', color='BGR');

# Same thing but this time dropping some of the features for clarity
# (the data is re-read from disk and plotted unscaled)
plt.figure(figsize=(10,10))
df_all_data = pd.read_csv(your_path) 
less_interesting_features=['Guarantors','Sex & Marital Status','Instalment per cent', 'Duration in Current address','Age (years)','Most valuable available asset', 'Concurrent Credits', 'Type of apartment', 'No of Credits at this Bank','Occupation', 'No of dependents', 'Telephone', 'Foreign Worker','Value Savings/Stocks']
df_all_data_dropped = df_all_data.drop(less_interesting_features, axis=1)
radviz(df_all_data_dropped, 'Default',color='BGR');

#   //////////////////////////////////////////////////////////////
#  ////////              Train - Test - Split             ///////
# //////////////////////////////////////////////////////////////

# 70/30 split with a fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.30, random_state=7)
Exemplo n.º 23
0
def plot_radviz(plotdata,label):
    """Show a RadViz plot of ``plotdata`` coloured by ``label``.

    Args:
        plotdata: DataFrame of features to plot.
        label: Class labels, one per row; plotted as a ``'label'`` column.
    """
    # Attach the labels to a copy so the caller's DataFrame does not gain
    # (or have overwritten) a 'label' column as a side effect of plotting.
    labelled = plotdata.assign(label=label)
    plt.figure('kmeans-radviz',figsize=(100,50))
    plt.title('radviz')
    radviz(labelled, 'label')
    plt.show()
Exemplo n.º 24
0
#plt.show()

#9 this shows the relationship between the various sepal/petal lengths/widths measures
g = sns.pairplot(iris, hue="name")
plt.savefig("../graphs/pairploth.jpg")
#plt.show()
# similar to above but with kde along the diag instead of histograms
#
g = sns.pairplot(iris, hue="name", diag_kind="kde")
plt.savefig("../graphs/pairplotk.jpg")
#plt.show()

# The pandas plotting helpers below draw on the current axes when none is
# given. Each one therefore gets a fresh figure; otherwise it would be drawn
# onto the last pairplot panel (and onto the previous pandas plot), and the
# saved files would contain the accumulated drawings.

#10 Andrews curves are a method for visualizing multidimensional data by
# mapping each observation onto a function. Each colour represents a class;
# lines for samples of the same class tend to have similar curves, and
# curves that lie close together suggest the corresponding data points are
# close together as well.
from pandas.plotting import andrews_curves
plt.figure()
andrews_curves(iris, "name")
plt.savefig("../graphs/andrewcs.jpg")
#plt.show()

#11 Another multivariate visualization technique pandas has is parallel_coordinates
# Parallel coordinates plots each feature on a separate column & then draws lines
# connecting the features for each data sample.
# Inselberg (1997) reviews how to read relational patterns from such plots:
# lines between two axes that are mostly parallel suggest a positive
# relationship between the two dimensions; lines crossing in X shapes
# suggest a negative relationship; lines crossing randomly suggest no
# particular relationship.
from pandas.plotting import parallel_coordinates
plt.figure()
parallel_coordinates(iris, "name")
plt.savefig("../graphs/parac.jpg")
#plt.show()
#12
from pandas.plotting import radviz
plt.figure()
radviz(iris, "name")
plt.savefig("../graphs/rad.jpg")
#plt.show()
Exemplo n.º 25
0
# Drop the last element of `data`.
# NOTE(review): the original note reads 'A [""] as last', presumably an
# empty trailing row produced by the reader -- confirm.
data = data[:-1]
# Replace the species name in the last field of every row by a numeric code
# so the whole table can be converted to a float array.
trans_dict = {
    "Iris-setosa": "0",
    "Iris-versicolor": "1",
    "Iris-virginica": "2"
}
for i in data:
    i[-1] = trans_dict[i[-1]]

data = np.asarray(data, dtype="float32")
numpy_data = data.copy()  # keep the raw array for the PCA below
data = pd.DataFrame(data)
# Change columns to string.
data.rename(mapper=lambda x: str(x), axis=1, inplace=True)
# Column "4" holds the numeric species code.
radviz(data, class_column="4")
plt.show()

# Split into the first four columns (features) and the remainder (label).
x, y = np.split(numpy_data, (4, ), axis=1)

# PCA
# Col as a var, row as attr.
# Centre every column on its mean.
M = np.mean(x, axis=0)
x = x - M

# Covariance matrix of the centred features.
C = np.cov(x.T)

eigenvalue, eigenvetcor = np.linalg.eig(C)
# Get the index of reversed sorted eigenvalue
# (indices of the P largest eigenvalues; `P` is defined outside this excerpt)
sorted_index = np.argsort(-eigenvalue)[:P]
# Get the first p vectors to get the projection matrix.
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix, andrews_curves, parallel_coordinates, radviz
import matplotlib.pyplot as plt

#data = pd.read_csv('test.csv')
#data = pd.read_csv('BostonHousing.csv')
#scatter_matrix(data, alpha=0.2, figsize=(10, 10), diagonal='kde')
#plt.suptitle('scatter-matrix')
#plt.show()

# data = pd.read_csv('BostonHousing.csv')
# data.plot.line(x='medv', y=['rad', 'indus', 'ptratio', 'lstat'], figsize=(10, 10))
# plt.show()

# Read test.csv and show a RadViz plot using 'medv' as the class column.
# NOTE(review): radviz treats the class column as categorical; if 'medv' is
# a continuous value, every distinct value becomes its own class -- confirm.
data = pd.read_csv('test.csv')
#data = data.cumsum()
radviz(data, 'medv')
plt.show()
# print (data.head())
Exemplo n.º 27
0
print('################################')
print('Working Base :', Base, ' using ', sys.platform)
print('################################')
################################################################
sDataFile = Base + '/01-Vermeulen/00-RawData/irisdata.csv'

data = pd.read_csv(sDataFile)

# For each plot below, tight_layout() runs before savefig() so that the
# saved image has the same layout as the figure that is shown afterwards.

# Andrews curves, coloured by the 'Name' column.
from pandas.plotting import andrews_curves
plt.figure(figsize=(10, 10))
andrews_curves(data, 'Name')
sPicNameOut1 = Base + '/01-Vermeulen/06-Report/01-EDS/02-Python/andrews_curves.png'
plt.tight_layout()
plt.savefig(sPicNameOut1, dpi=600)
plt.show()

# Parallel coordinates, coloured by the 'Name' column.
from pandas.plotting import parallel_coordinates
plt.figure(figsize=(10, 10))
parallel_coordinates(data, 'Name')
sPicNameOut2 = Base + '/01-Vermeulen/06-Report/01-EDS/02-Python/parallel_coordinates.png'
plt.tight_layout()
plt.savefig(sPicNameOut2, dpi=600)
plt.show()

# RadViz, coloured by the 'Name' column.
from pandas.plotting import radviz
plt.figure(figsize=(10, 10))
radviz(data, 'Name')
sPicNameOut3 = Base + '/01-Vermeulen/06-Report/01-EDS/02-Python/radviz.png'
plt.tight_layout()
plt.savefig(sPicNameOut3, dpi=600)
plt.show()
Exemplo n.º 28
0

# Hexagonal binning plot (honeycomb plot)
# Drawn through the pandas plotting API.
# gridsize: number of bins along the x axis (documented default: 100)
salary.plot.hexbin(x='salary', y='begin_salary', gridsize=25)
clf_cla_close(plt)
# Bubble-chart-like variant: each bin shows the minimum 'age' of its points
salary.plot.hexbin(x='salary', y='begin_salary', C='age', reduce_C_function=np.min, gridsize=25)
clf_cla_close(plt)


# RadViz plot
fig = plt.figure()
# Argument 1: the data to analyse; argument 2: the class column
radviz(salary[['salary', 'begin_salary', 'age', 'education', 'jobtime', 'position']], 'position')
clf_cla_close(plt)


# Parallel-coordinates plot (x axis: the variables analysed, y axis: the
# value of each variable)
# Argument 1: the data to analyse; argument 2: the class column
parallel_coordinates(salary[['salary', 'begin_salary', 'jobtime', 'position']], 'position')
clf_cla_close(plt)


# Andrews curves (points in the high-dimensional space are mapped to curves
# on the 2-D plane through a trigonometric transform)
andrews_curves(salary[['salary', 'begin_salary', 'jobtime', 'position']], 'position')
clf_cla_close(plt)


# 等高线图
Exemplo n.º 29
0
# Scatter matrix of the iris frame with KDE plots on the diagonal.
# NOTE(review): `title` is set outside this excerpt, and scatter_matrix may
# create its own figure rather than draw on `fig` -- confirm that `fig` is
# the figure iris_fig should receive.
fig = plt.figure()
scatter_matrix(iris_pd, alpha=0.3, diagonal='kde')
iris_fig(fig, title)

title = 'Andrews_Curves'
# Very cool way to try to differentiate between classes. Some math is needed
# https://en.wikipedia.org/wiki/Andrews_plot
# http://sci-hub.cc/
fig = plt.figure()
andrews_curves(iris_pd, 'tgt')
iris_fig(fig, title)

title = 'Radviz'
# Springy area plots
fig = plt.figure()
radviz(iris_pd, 'tgt')
iris_fig(fig, title)

# %% Machine Learning - Classification Tree
# http://docs.python-guide.org/en/latest/scenarios/ml/

# Randomizes order of indices for splitting the data into train and test sets
ids = np.random.permutation(len(x))

# Gets n-10 data points for training
x_train = x[ids[:-10]]
y_train = y[ids[:-10]]

# Gets 10 data points for testing
x_test = x[ids[-10:]]
y_test = y[ids[-10:]]
plt.scatter(X_2[y==1,0], X_2[y==1,1], color='r') # scatter plot of the samples with y == 1
plt.savefig("./Pictures/raw_scatter_2.png") # save the raw-data distribution plot
plt.cla() # clear the current axes

# Raw data visualisation (parallel coordinates)
data = pd.read_csv(r"iris.csv") # read the data
plt.figure('多维度-parallel_coordinates')
plt.title('parallel_coordinates') # add a title
parallel_coordinates(data, 'Class', color=['b', 'g'])
plt.savefig("./Pictures/raw_parallel_coordinates.png") # save the raw-data distribution plot
plt.cla() # clear the current axes

# Raw data visualisation (RadViz)
plt.figure('多维度-radviz')
plt.title('radviz')
radviz(data, 'Class', color=['red', 'm'])
plt.savefig("./Pictures/raw_radviz.png") # save the raw-data distribution plot
plt.cla() # clear the current axes

# Raw data visualisation (Andrews curves)
plt.figure('多维度-andrews_curves')
plt.title('andrews_curves')
andrews_curves(data, 'Class', color=['pink', 'gold'])
plt.savefig("./Pictures/raw_andrews_curves.png") # save the raw-data distribution plot
plt.cla() # clear the current axes

# Feature correlation heat map: re-read the data and keep the first four
# columns only.
data = pd.read_csv(r"iris.csv")
data = data.iloc[:,[0,1,2,3]]

def heat_map(df):
Exemplo n.º 31
0
# Read the letter-recognition data; the file has no header row, so the
# column names come from `columns`.
dataF = pd.read_csv("letter-recognition.data",
                    sep=',',
                    header=None,
                    names=columns)
# Print max / mean / min of every column except 'y1'.
for x in columns:
    if x == 'y1': continue
    print(x + " max: " + str(dataF[x].max()), end="    ")
    print("mean: " + str(dataF[x].mean()), end="    ")
    print("min: " + str(dataF[x].min()))

# The `if 1:` / `if 0:` guards below switch the individual plots on and off
# by hand.
if 1:
    # Turn the letter in 'y1' into a number: ord('A') is 65, so 'A' -> 0.
    dataF['y1'] = dataF['y1'].apply(lambda y: ord(y) - 65)
    scatter_matrix(dataF, alpha=0.2, figsize=(6, 6), diagonal='kde')
    plt.show()

if 0:
    # RadViz plot coloured by 'y1'
    plt.figure()
    radviz(dataF, 'y1')
    plt.show()

if 0:
    # One KDE subplot per column, laid out on a 2 x 12 grid.
    ser = dataF
    ser.plot.kde(subplots=True,
                 layout=(2, 12),
                 legend=False,
                 Label=False,
                 yticks=[],
                 xticks=[])
    plt.show()