示例#1
0
def pcoords(X, y, outpath, **kwargs):
    """Draw a parallel coordinates plot for (X, y) and save it to outpath.

    Any extra keyword arguments are forwarded to ParallelCoordinates.
    """
    # Fresh figure/axes so repeated calls never draw over each other
    _, axes = plt.subplots()

    viz = ParallelCoordinates(ax=axes, **kwargs)
    viz.fit_transform(X, y)

    # Write the rendered figure to disk
    viz.poof(outpath=outpath)
示例#2
0
    def pcoords_time(X, y, fast=True):
        """Return the wall-clock seconds spent fitting a ParallelCoordinates
        visualizer on (X, y).

        Parameters
        ----------
        X, y : array-like
            Feature matrix and target vector passed to fit_transform.
        fast : bool
            Use yellowbrick's fast drawing mode when True.
        """
        _, ax = plt.subplots()
        oz = ParallelCoordinates(fast=fast, ax=ax)

        # perf_counter is a monotonic, high-resolution clock intended for
        # interval timing; time.time() can jump if the wall clock is adjusted.
        start = time.perf_counter()
        oz.fit_transform(X, y)
        delta = time.perf_counter() - start

        # Dispose of the figure so repeated timing runs don't accumulate plots
        plt.cla()        # clear current axis
        plt.clf()        # clear current figure
        plt.close("all") # close all existing plots

        return delta
示例#3
0
def plot_fast_vs_slow():
    """Draw standard and fast parallel coordinates of iris, one per row,
    and save the combined figure."""
    data = load_iris()

    _, axes = plt.subplots(nrows=2, figsize=(9,9))

    modes = (
        (False, "Standard Parallel Coordinates"),
        (True, "Fast Parallel Coordinates"),
    )
    for row, (fast, title) in enumerate(modes):
        oz = ParallelCoordinates(ax=axes[row], fast=fast, title=title)
        oz.fit_transform(data.data, data.target)
        oz.finalize()

    plt.tight_layout()
    plt.savefig("images/fast_vs_slow_parallel_coordinates.png")
示例#4
0
    def pcoords_time(X, y, fast=True):
        """Return the wall-clock seconds spent fitting a ParallelCoordinates
        visualizer on (X, y).

        Parameters
        ----------
        X, y : array-like
            Feature matrix and target vector passed to fit_transform.
        fast : bool
            Use yellowbrick's fast drawing mode when True.
        """
        _, ax = plt.subplots()
        oz = ParallelCoordinates(fast=fast, ax=ax)

        # perf_counter is a monotonic, high-resolution clock intended for
        # interval timing; time.time() can jump if the wall clock is adjusted.
        start = time.perf_counter()
        oz.fit_transform(X, y)
        delta = time.perf_counter() - start

        # Dispose of the figure so repeated timing runs don't accumulate plots
        plt.cla()        # clear current axis
        plt.clf()        # clear current figure
        plt.close("all") # close all existing plots

        return delta
示例#5
0
def test_parallel_coords(pandas=False, outpath=None):
    """
    Runs the parallel coordinates visualizer on the dataset.

    Parameters
    ----------
    pandas : bool
        Run the pandas version of the function
    outpath : path or None
        Save the figure to disk rather than show (if None)
    """
    data = load_data('occupancy')  # Load the data
    features = ['temp', 'humid', 'light', 'co2', 'hratio']
    classes = ['unoccupied', 'occupied']
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
    # supported replacement and yields the same ndarray of values.
    X = data[features].to_numpy()
    y = data.occupied.to_numpy()

    if pandas:
        parallel_coordinates(data[features + ['occupied']], 'occupied')
        if outpath:
            plt.savefig(outpath)
        else:
            plt.show()

    else:
        visualizer = ParallelCoordinates(  # Instantiate the visualizer
            classes=classes, features=features)
        visualizer.fit(X, y)  # Fit the data to the visualizer
        visualizer.transform(X)  # Transform the data
        visualizer.poof(outpath=outpath)  # Draw/show/poof the data
示例#6
0
def pcoords(X, y, outpath, **kwargs):
    """Render a parallel coordinates plot for (X, y) and save it to outpath.

    Extra keyword arguments are passed through to ParallelCoordinates.
    """
    # Dedicated figure/axes for this plot
    _, axes = plt.subplots()

    viz = ParallelCoordinates(ax=axes, **kwargs)
    viz.fit(X, y)
    viz.transform(X)

    # Persist the rendered figure
    viz.poof(outpath=outpath)
示例#7
0
def pcoords(X, y, outpath, **kwargs):
    """Render a parallel coordinates plot for (X, y) and save it to outpath.

    Extra keyword arguments are forwarded to ParallelCoordinates; a caller
    that supplies its own ``ax`` still wins over the one created here.
    """
    # Create a new figure and axes
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # BUG FIX: the original created ``ax`` but never handed it to the
    # visualizer, so drawing happened on an implicit axes and the new figure
    # was wasted. setdefault keeps an explicit ax passed via kwargs working.
    kwargs.setdefault("ax", ax)
    visualizer = ParallelCoordinates(**kwargs)
    visualizer.fit(X, y)
    visualizer.transform(X)

    # Save to disk
    visualizer.poof(outpath=outpath)
示例#8
0
def plot_fast_vs_slow():
    """Compare standard vs. fast parallel coordinates on iris and save PNG."""
    data = load_iris()

    _, axes = plt.subplots(nrows=2, figsize=(9,9))

    row = 0
    for fast in (False, True):
        if fast:
            title = "Fast Parallel Coordinates"
        else:
            title = "Standard Parallel Coordinates"
        oz = ParallelCoordinates(ax=axes[row], fast=fast, title=title)
        oz.fit_transform(data.data, data.target)
        oz.finalize()
        row += 1

    plt.tight_layout()
    plt.savefig("images/fast_vs_slow_parallel_coordinates.png")
示例#9
0
def pcoords(ax):
    """Build and fit a ParallelCoordinates visualizer for the occupancy data.

    The caller supplies the axes to draw on; the fitted visualizer is
    returned so the caller can finalize or show it.
    """
    from yellowbrick.features import ParallelCoordinates

    # Features of interest and the classes of the target
    features = ["temperature", "relative humidity", "light", "C02", "humidity"]
    target = "occupancy"
    classes = ['unoccupied', 'occupied']

    # Load the dataset restricted to those columns
    X, y = load_data('occupancy', cols=features, target=target)

    viz = ParallelCoordinates(ax=ax, classes=classes, features=features)
    viz.title = "Parallel Coordinates of Features to Predict Room Occupancy"
    viz.fit(X, y)
    viz.transform(X)
    return viz
示例#10
0
# Drop rows that are entirely zero from the "other" (non-binding) samples.
o = o[~np.all(o == 0, axis=1)]

# Label the two classes: 1 = binding site, 0 = other.
binding_sites = bs
other = o
binding_sites_labels = np.ones(binding_sites.shape[0], dtype=np.uint8)
other_labels = np.zeros(other.shape[0], dtype=np.uint8)
# Stack both groups into one design matrix / label vector
# (binding sites first, then "other" — order must match between X and y).
X = np.concatenate((binding_sites, other))
y = np.concatenate((binding_sites_labels, other_labels))

# %%
# Class balance bar chart.
# NOTE(review): assumes class_names is defined earlier in the notebook — confirm.
visualizer = ClassBalance(labels=class_names)
visualizer.fit(y)
visualizer.poof()

# %%
# Parallel coordinates over all features, colored by class
visualizer = ParallelCoordinates()
visualizer.fit_transform(X, y)
visualizer.poof()

# %%
# One-dimensional feature ranking
visualizer = Rank1D()
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()

# %%
# Pairwise (2D) feature ranking
visualizer = Rank2D()
visualizer.fit_transform(X)
visualizer.poof()

# %%
示例#11
0
		# NOTE(review): fragment begins mid-function; `visualizer` here is a
		# silhouette visualizer created above this excerpt — confirm upstream.
		visualizer.finalize()
		plt.savefig(path.join(PLOT_DIR, abbrev + "_km-rp_silhouette.png"), bbox_inches='tight')
		visualizer.show()
		plt.close()

		# pairplot
		print("# Scatterplot for " + label)
		sns.set(style="ticks")
		grid = sns.pairplot(df, hue="cluster", vars=feature_names)
		# Leave headroom for the suptitle above the pair grid
		plt.subplots_adjust(top=0.96)
		grid.fig.suptitle(label + ": K-means k=" + str(best_k))
		plt.savefig(path.join(PLOT_DIR, abbrev + "_km-rp_scatter.png"), bbox_inches='tight')
		plt.show()
		plt.close()

		# parallel coordinates plot
		print("# Parallel Coordinates Plot for " + label)
		# Sample 10% of rows (shuffled) and use fast mode to keep drawing cheap
		visualizer = ParallelCoordinates(features=feature_names, sample=0.1, shuffle=True, fast=True)
		visualizer.fit_transform(X, y_pred)
		# Rotate tick labels so long feature names remain readable
		visualizer.ax.set_xticklabels(visualizer.ax.get_xticklabels(), rotation=45, horizontalalignment='right')
		visualizer.finalize()
		plt.savefig(path.join(PLOT_DIR, abbrev + "_km-rp_parallel.png"), bbox_inches='tight')
		visualizer.show()
		plt.close()

		# compare with ground truth (classes)
		print(label + ": Homogeneity Score = " + str(metrics.homogeneity_score(y, y_pred)))
		print(label + ": V Measure Score = " + str(metrics.v_measure_score(y, y_pred)))
		print(label + ": Mutual Info Score = " + str(metrics.mutual_info_score(y, y_pred)))
		print(label + ": Adjusted Rand Index = " + str(metrics.adjusted_rand_score(y, y_pred)))
示例#12
0
def pcoords():
    """Draw a sampled parallel-coordinates plot of the occupancy dataset."""
    features, labels = load_occupancy()
    # 5% shuffled sample keeps the plot legible on a large dataset
    options = {"sample": 0.05, "shuffle": True, "ax": newfig()}
    oz = ParallelCoordinates(**options)
    oz.fit_transform(features, labels)
    savefig(oz, "parallel_coordinates")
		# NOTE(review): fragment begins mid-function — the explained-variance
		# plot being labelled here is drawn above this excerpt; confirm upstream.
		plt.xlabel('number of components')
		plt.ylabel('variance (%)')
		plt.title(label + ": Explained Variance by Number of Components")
		plt.savefig(path.join(PLOT_DIR, abbrev + "_pca_variance.png"), bbox_inches='tight')
		plt.show()
		plt.close()

		# save as new set of features
		pca = PCA(n_components=n_components, svd_solver='full', random_state=SEED)
		# Time only the fit_transform step with a monotonic clock
		start_time = time.perf_counter()
		df = pd.DataFrame(pca.fit_transform(X))
		run_time = time.perf_counter() - start_time
		print(label + ": run time = " + str(run_time))
		df.to_pickle(path.join(PKL_DIR, abbrev + "_pca.pickle"))

		# parallel coordinates plot
		# 20% shuffled sample in fast mode keeps the drawing tractable
		visualizer = ParallelCoordinates(sample=0.2, shuffle=True, fast=True)
		visualizer.fit_transform(df, y)
		# Rotate tick labels so component names stay readable
		visualizer.ax.set_xticklabels(visualizer.ax.get_xticklabels(), rotation=45, horizontalalignment='right')
		visualizer.finalize()
		plt.savefig(path.join(PLOT_DIR, abbrev + "_pca_parallel.png"), bbox_inches='tight')
		visualizer.show()
		plt.close()

		# output reconstruction error
		recon_err = get_reconstruction_error_invertable(X, df, pca)
		print(label + ": reconstruction error = " + str(recon_err))

		# distribution of eigenvalues
		print(label + ": eigenvalues?", pca.components_)
# Numeric columns to normalize for the parallel-coordinates plot
num_features = ['Age', 'SibSp', 'Parch', 'Fare']

# copy data to a new dataframe
data_norm = data.copy()
# Mean-normalize each numeric feature: (value - mean) / (max - min).
# NOTE(review): despite the original comment, this is mean normalization,
# not strict 0-1 min-max scaling — values can be negative.
for feature in num_features:
    data_norm[feature] = (data[feature] - data[feature].mean(skipna=True)) / (
        data[feature].max(skipna=True) - data[feature].min(skipna=True))

# Extract the numpy arrays from the data frame
X = data_norm[num_features].to_numpy()
y = data.Survived.to_numpy()

# Instantiate the visualizer
visualizer = ParallelCoordinates(classes=classes, features=num_features)

visualizer.fit(X, y)  # Fit the data to the visualizer
visualizer.transform(X)  # Transform the data
visualizer.poof(outpath="d://pcoords2.png")  # Draw/show/poof the data
plt.show()

# Step 10 - stacked bar charts to compare survived/not survived
# set up the figure size
# %matplotlib inline
plt.rcParams['figure.figsize'] = (20, 10)

# make subplots
fig, axes = plt.subplots(nrows=2, ncols=2)

# make the data ready to feed into the visualizer
    # NOTE(review): fragment begins mid-function — df, label, abbrev,
    # feature_names, best_k, X, y and y_pred come from the enclosing scope.
    # pairplot
    print("# Scatterplot for " + label)
    sns.set(style="ticks")
    grid = sns.pairplot(df, hue="cluster", vars=feature_names)
    # Leave headroom for the suptitle above the pair grid
    plt.subplots_adjust(top=0.96)
    grid.fig.suptitle(label + ": K-means k=" + str(best_k))
    plt.savefig(path.join(PLOT_DIR, abbrev + "_em_scatter.png"),
                bbox_inches='tight')
    plt.show()
    plt.close()

    # parallel coordinates plot
    print("# Parallel Coordinates Plot for " + label)
    # 10% shuffled sample in fast mode keeps drawing cheap on large data
    visualizer = ParallelCoordinates(features=feature_names,
                                     sample=0.1,
                                     shuffle=True,
                                     fast=True)
    visualizer.fit_transform(X, y_pred)
    # Rotate tick labels so long feature names remain readable
    visualizer.ax.set_xticklabels(visualizer.ax.get_xticklabels(),
                                  rotation=45,
                                  horizontalalignment='right')
    visualizer.finalize()
    plt.savefig(path.join(PLOT_DIR, abbrev + "_em_parallel.png"),
                bbox_inches='tight')
    visualizer.show()
    plt.close()

    # compare with ground truth (classes)
    print(label + ": Homogeneity Score = " +
          str(metrics.homogeneity_score(y, y_pred)))
    print(label + ": V Measure Score = " +
示例#16
0
# Pairwise covariance ranking of the features
oz = Rank2D(features=cols, algorithm='covariance')
oz.fit_transform(X, y)
oz.poof()

# Hexbin joint distributions of rating against review count and price
g = sns.jointplot(x='review_count', y='rating', kind='hex', data=data)

h = sns.jointplot(x='price', y='rating', kind='hex', data=data)

# Encode string targets as integers for the class-based visualizers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
oz = RadViz(classes=label_encoder.classes_, features=cols)
oz.fit(X, y)
oz.poof()

# Parallel coordinates: first on raw values, then min-max normalized
oz = ParallelCoordinates(classes=label_encoder.classes_, features=cols)
oz.fit(X, y)
oz.poof()

oz = ParallelCoordinates(normalize='minmax', classes=label_encoder.classes_, features=cols)
oz.fit(X, y)
oz.poof()

df = pd.DataFrame(data)
numeric_features = [
    "price",
    "rating",
    "review_count",
    "high_risk_1",
    "medium_risk_2",
    "low_risk_2"
示例#17
0
# Interactive-session transcript: explore iris with yellowbrick, then fit
# a decision tree with explicit scaling and again as a Pipeline.
from sklearn.datasets import load_iris
data = load_iris()
data
import pandas as pd
pd.read_csv(data.filename)
from yellowbrick.features import ParallelCoordinates
# Standard-scaled parallel coordinates of the four iris features
viz = ParallelCoordinates(features=data.feature_names,
                          classes=data.target_names,
                          normalize='standard')
viz.fit(data.data, data.target)
viz.transform(data.data)
viz.poof()
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Fit the scaler on the training split only, to avoid test-set leakage
scaler = StandardScaler()
scaler.fit(X_train)
X1 = scaler.transform(X_train)

clf = DecisionTreeClassifier()
clf.fit(X1, y_train)
X2 = scaler.transform(X_test)
clf.predict(X2)
# Equivalent scaler+tree workflow expressed as a single Pipeline
from sklearn.pipeline import Pipeline
decision_pipeline = Pipeline([
    ('normalize', StandardScaler()),
    ('decision', DecisionTreeClassifier())
])
decision_pipeline.fit(X_train, y_train)
示例#18
0
# NOTE(review): `transformation` and `raw_data` are defined outside this
# excerpt; the chained calls build a cleaned feature frame.
tf = transformation(raw_data).fill_na_columns(columns=[]).drop_columns(
    columns=[])
X, y = tf.create_X_y()

# Keep only rows IsolationForest labels as inliers (prediction == 1).
# NOTE(review): `column` comes from the enclosing scope — outliers are judged
# on that single feature only; confirm this is intended.
mask = IsolationForest(contamination=0.15).fit_predict(X[column].to_frame(),
                                                       y) == 1
new_X = X[mask]
new_y = y[mask]

X_scaled = StandardScaler().fit_transform(new_X)

target_names = ['f', 's']

# Parallel coordinates over half of the (shuffled) inlier rows
visualizer = ParallelCoordinates(classes=target_names,
                                 features=list(X.columns),
                                 sample=0.5,
                                 shuffle=True)
visualizer.fit_transform(X_scaled, new_y)
visualizer.show()

# %%
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Rebuild X/y with one-hot encoded categoricals for the projection methods
X, y = tf.cats_to_one_hot(columns=[]).create_X_y()

X_scaled = StandardScaler().fit_transform(X)

X = X.values

pca = PCA(n_components=2)
示例#19
0
	print(len(features))
	XT = X[:,numpy.asarray(features_ordered)]
	print(X.shape, XT.shape, Y.shape)
	"""
    # Build a small synthetic dataset: 500 two-column rows with random labels
    # in {0, 10, ..., 50}; seeded for reproducibility.
    numpy.random.seed(1)
    XT = numpy.arange(1000).reshape((-1, 2))
    YT = numpy.random.randint(6, size=len(XT)) * 10
    print(XT.shape, XT.dtype, YT.shape, YT.dtype)

    visualizer = RadViz(classes=[0, 10, 20, 30, 40, 50])
    visualizer.fit_transform(XT, YT)
    visualizer.poof(outpath='viz_feature_radviz.pdf', bbox_inches='tight')
    plt.close('all')

    # Parallel coordinates on a 5% shuffled sample in fast mode
    visualizer = ParallelCoordinates(classes=[0, 10, 20, 30, 40, 50],
                                     sample=0.05,
                                     shuffle=True,
                                     fast=True)
    visualizer.fit_transform(XT, YT)
    visualizer.poof(outpath='viz_feature_parallel_coords.pdf',
                    bbox_inches='tight')
    plt.close('all')
    # Switch from the synthetic data to the real dataset.
    # NOTE(review): X, Y and labels come from the enclosing scope — confirm.
    XT = X
    YT = Y

    print(XT.shape, XT.dtype, YT.shape, YT.dtype)
    visualizer = RadViz(classes=labels)
    visualizer.fit_transform(XT, YT)
    visualizer.poof(outpath='viz_feature_radviz.pdf', bbox_inches='tight')
    plt.close('all')
    visualizer = ParallelCoordinates(classes=labels,
# copy data to a new dataframe
data_norm = data.copy()
# Mean-normalize each numeric feature: (value - mean) / (max - min).
# NOTE(review): this is mean normalization, not strict 0-1 scaling.
for feature in num_features:
    data_norm[feature] = (data[feature] - data[feature].mean(skipna=True)) / (
        data[feature].max(skipna=True) - data[feature].min(skipna=True))

# Extract the numpy arrays from the data frame
# (as_matrix() was the pre-pandas-1.0 spelling; to_numpy() replaces it)
# X = data_norm[num_features].as_matrix()
# y = data.Survived.as_matrix()
X = data_norm[num_features].to_numpy()
# NOTE(review): target switched from Survived to BendCurve here — confirm.
y = data.BendCurve.to_numpy()

# Instantiate the visualizer
visualizer = ParallelCoordinates(classes=classes, features=num_features)

visualizer.fit(X, y)  # Fit the data to the visualizer
visualizer.transform(X)  # Transform the data
visualizer.poof(outpath="pcoords2.png")  # Draw/show/poof the data
plt.show()

print("Starting Step 10 Here ")
# Step 10 - stacked bar charts to compare survived/not survived
#set up the figure size
#%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 10)

# make subplots
fig, axes = plt.subplots(nrows=2, ncols=2)
# create a copy of original df
data_norm = df.copy()

# Min-max normalize each numeric feature into the 0-1 range
for feature in num_features:
    data_norm[feature] = (df[feature] - df[feature].min(skipna=True)) / (
        df[feature].max(skipna=True) - df[feature].min(skipna=True))

# convert values to numpy arrays
X = data_norm[num_features].to_numpy()
y = df.Survived.to_numpy()

# set up visualizer
from yellowbrick.features import ParallelCoordinates

visualizer = ParallelCoordinates(classes=classes, features=num_features)

# fit visualizer
visualizer.fit(X, y)
visualizer.transform(X)
# create PNG file and also display in shell
visualizer.show(outpath="titanic_fig4.png")
visualizer.show()

# set figure size, make subplots
plt.rcParams['figure.figsize'] = (20, 10)
fig, axes = plt.subplots(nrows=2, ncols=2)

# convert binary to survived/not survived, group by sex
Sex_survived = df.replace({'Survived': {
    1: 'Survived',