def loadSDSSdata(folder='/Users/sammy/Google Drive/MachineLearning/AstroSDSS/', filename="qso10000.csv",
                 plot=False):
    """
    Load SDSS QSO data. The redshift range is rather broad, from about 0.3 to 6.
    """
    filename = folder + filename
    qsos = pd.read_csv(filename,index_col=0, usecols=["objid","dered_r","spec_z","u_g_color",
                                                      "g_r_color","r_i_color","i_z_color","diff_u",
                                                      "diff_g1","diff_i","diff_z"])

    qsos = qsos[(qsos["dered_r"] > -9999) & (qsos["g_r_color"] > -10) & (qsos["g_r_color"] < 10)]
    qso_features = copy.copy(qsos)
    qso_redshifts = qsos["spec_z"]
    del qso_features["spec_z"]

    if plot:
        ## truncate the color at z=2.5 just to keep some contrast.
        norm = mpl.colors.Normalize(vmin=min(qso_redshifts.values), vmax=2.5)
        cmap = cm.jet_r
        m = cm.ScalarMappable(norm=norm, cmap=cmap)
        pd.scatter_matrix(qso_features[0:2000], alpha=0.2, figsize=[15, 15],
                          color=m.to_rgba(qso_redshifts.values))
        plt.savefig('Sample.pdf')
        plt.close()

    X_train, X_test, y_train, y_test = train_test_split(qso_features.values, qso_redshifts.values,
                                                        random_state=42)

    print "feature vector shape=", qso_features.values.shape
    print 'Training sample shape=', X_train.shape
    print 'Testing sample shape=', X_test.shape

    return X_train, X_test, y_train, y_test
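
# Hedged usage sketch (added, not part of the original source): the split
# above drops spec_z from the features, so a quick photometric-redshift
# baseline could look like this; KNeighborsRegressor is an illustrative choice.
from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = loadSDSSdata()
knn = KNeighborsRegressor(n_neighbors=10).fit(X_train, y_train)
print("test-sample R^2:", knn.score(X_test, y_test))
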
def feature_m(df_all):
    df_X = df_all[['upgraded_HD', 
                   'upgraded_cpu', 
                   'upgraded_memory', 
                   'apple_care',
                   'year',
                   'px', 
                   'cpu_speed',
                   'image_url_ct',
                   'memory',
                   'HD_size']].copy()  # copy to avoid SettingWithCopyWarning on the assignments below

    df_X['apple_care'] = binarize_boolean_series(df_X['apple_care'])
    df_X['upgraded_HD'] = binarize_boolean_series(df_X['upgraded_HD'])
    df_X['upgraded_memory'] = binarize_boolean_series(df_X['upgraded_memory'])
    df_X['upgraded_cpu'] = binarize_boolean_series(df_X['upgraded_cpu'])        
    df_X['year'] = df_X['year'].astype(int)
    df_X['px'] = df_X['px'].astype(int)
    df_X['cpu_speed'] = df_X['cpu_speed'].astype(float)
    df_X['HD_size'] = df_X['HD_size'].astype(float)
    df_X['memory'] = df_X['memory'].astype(int)

    pd.scatter_matrix(df_X, figsize=(15,15));
    y = df_X.pop('year').ravel()
    X = np.array(df_X)
    return X, y
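
# Hedged usage sketch (added): feature_m pops 'year' as the target, so the
# returned pair feeds straight into scikit-learn; the model choice and the
# df_all variable here are illustrative assumptions.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = feature_m(df_all)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X_tr, y_tr)
print("held-out accuracy:", clf.score(X_te, y_te))
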
    def test_scatter_plot_legacy(self):
        df = pd.DataFrame(randn(100, 2))

        with tm.assert_produces_warning(FutureWarning):
            plotting.scatter_matrix(df)

        with tm.assert_produces_warning(FutureWarning):
            pd.scatter_matrix(df)
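
# The FutureWarning asserted above exists because the top-level pd.scatter_matrix
# was deprecated; a minimal sketch of the non-deprecated spelling, assuming a
# pandas version that exposes the pandas.plotting namespace:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

frame = pd.DataFrame(np.random.randn(100, 2))
scatter_matrix(frame, alpha=0.5, diagonal='hist')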
Example #4
    def _scatterMatrixAct(self):
        df = self.getNumberDataFrame()
        if df is None: return

        DyMatplotlib.newFig()

        pd.scatter_matrix(df)
        plt.gcf().show()
def show_scatter(data, col):

    '''
    shows a scatter matrix of the data
    '''

    if col:
        pd.scatter_matrix(data[col], figsize=(10, 10))
    else:
        pd.scatter_matrix(data, figsize=(10, 10))
Example #6
def slide_13():
    macro = pd.read_csv(MACRODATAPATH)
    data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]
    trans_data = np.log(data).diff().dropna()
    print trans_data[-5:]

    plt.scatter(trans_data['m1'], trans_data['unemp'])
    plt.title('Changes in log %s vs. log %s' % ('m1', 'unemp'))

    pd.scatter_matrix(trans_data, diagonal='kde', color='k', alpha=0.3)
def scatter_matrix_topp(sorted_frame, selected_axes, percentile=1):
    """

    Arguments:
    - `sorted_frame`:
    - `selected_axes`: the axes to include in .the scatterplot matrix
    - `percentile`:
    """
    pd.scatter_matrix(
        # integer division keeps the slice bound an int under Python 3
        np.log(sorted_frame[selected_axes] + 1)[:percentile * len(sorted_frame) // 100]
        )
Example #8
def scoreCorrelations(preds):
    figs=[]
    for p in preds:
        pred=preds[p]
        df=pred.data
        x = df.pivot_table(index='peptide', columns='allele', values=pred.scorekey)
        f=plt.figure()
        ax=f.add_subplot(111)
        pd.scatter_matrix(x, alpha=0.2, figsize=(12,12), diagonal='hist',ax=ax)
        #plt.tight_layout()
        figs.append(f)
    return figs
Example #9
def plot_scatter_matrix(title, tr, fig=None):
    if fig is None:
        fig = plt.figure()  # plt.Figure() would create a detached figure that pyplot never draws on
    t6 = pandas.Series(tr['c'])
    t8 = pandas.Series(tr['gmm'][:,0])
    t9 = pandas.Series(tr['gmm'][:,1])
    t10 = pandas.Series(tr['gmm_p'][:,0])
    t11 = pandas.Series(tr['pbeta'])
    df = pandas.DataFrame({'cat' : t6, 'gmm_0' : t8, 'gmm_1' : t9, 'p' : t10, 'pbeta' : t11})
    pandas.scatter_matrix(df)
    plt.title(title)
    return fig
Example #10
File: Graphs.py, Project: alanhdu/Dex
    def createMatrix(self, event):
        # TODO Fix ugly gridlines. sns.setStyle('nogrid') failed
        dlg = GraphDialog(self.parent, "Matrix Plot Input", ("Select Data",), 
                size=(500, 300), groups=False)

        if dlg.ShowModal() == wx.ID_OK:
            ds = [d[0] for d in dlg.GetName()]
            df = self.parent.data[ds]
            n = len(ds)
            dlg.Destroy()
                
            pd.scatter_matrix(df, grid=False)
            plt.show()
Example #11
    def performScaling(self):
        self.log_data = pd.DataFrame(np.log(self.data), columns=self.data.columns)
        self.log_samples = pd.DataFrame(np.log(self.samples), columns=self.samples.columns)
        fname = "customers_log.csv"
        if not os.path.isfile(fname):
            self.log_data.to_csv(fname)
            scaler = preprocessing.StandardScaler()
            self.data_log_std = pd.DataFrame(scaler.fit_transform(self.log_data), columns=self.log_data.columns)
            self.data_log_std.to_csv("customers_log_std.csv")
        
        pd.scatter_matrix(self.log_data, alpha = 0.3, figsize = (14,8), diagonal = 'kde')
        print(self.log_samples)
#         plt.show()
        return
Example #12
def plot_feature_scatter(df_feat, df_files, write_dst=''):
    '''Plot scatter matrix for all features.
    Save Exercise-labeled version of scatter plot for inspection'''
    
    # visualize features in the test set
    ax = pd.scatter_matrix(df_feat, alpha=0.2, figsize=(15, 15), diagonal='kde');

    # remove axis labels
    for axi in ax:
        for axij in axi:    
            axij.set_yticks([])
            axij.set_xticks([])

    if write_dst:
        # also create and save a version of this plot with points colored by exercise label
        df_labeled = df_feat.join(df_files.Exercise)

        g = sns.PairGrid(df_labeled, hue="Exercise")
        g.map_upper(plt.scatter, alpha=0.2)
        g.map_diag(plt.hist)
        # g.map_lower(sns.kdeplot, alpha=0.2, cmap='Greys_d')  # trouble calculating the kde

        g.add_legend()
        g.savefig(write_dst)
        plt.close() # don't create the plot here

    return ax
Example #13
def make_scatter_plots(features_of_interest, df):
    '''
    This function makes bivariate scatter matrix plot for the
    inputed features of interest, which are typically the 
    individual features of the greatest importance in our 
    supervised learning classification model
    INPUTS: features_of_interest = list of strings; df =  pandas
    data frame containing song feature data
    '''
    plt.figure()
    '''get mask containing songs used in our model'''
    good_mask = np.load('good_mask.npy')
    df = df[good_mask]
    contains_outliers = 'B- Var(c.t.)'
    '''
    remove outliers in the 'B- Var(c.t.)' feature to better see plots
    '''
    df = df[np.abs(df[contains_outliers]\
    - df[contains_outliers].mean()) / df[contains_outliers].std() <= 2.3 ]
    df_trunc = df[features_of_interest]
    color_dict = dict()
    '''label data points by color'''
    color_dict['tec'] = 'b'
    color_dict['hip'] = 'r'
    color_dict['cla'] = 'g'
    color_dict['roc'] = 'k'
    color_dict['pop'] = 'c'
    color_set = np.array([color_dict[name] for name in df['Label']])
    ax = pd.scatter_matrix(df_trunc, color = color_set)
    plt.xlabel('')
    plt.ylabel('')
Example #14
def exploratory_viz(loansData):
    plt.figure()
    p = loansData['FICO.Score'].hist()
    plt.savefig('../figs/fico_score_hist.png')

    a = pd.scatter_matrix(loansData, alpha=0.05, figsize=(14,14))
    plt.savefig('../figs/loan_scatter_matrix.png')
def plot_data(loansData):
    plt.figure()
    p = loansData['FICO.Score'].hist(bins=20)
    plt.show()

    a = pd.scatter_matrix(loansData, alpha=0.05, figsize=(10,10), diagonal='hist')
    plt.show()
Example #16
def scale_features(property_data, samples):

	# Scale the data using the natural logarithm
	# (copy first so the caller's DataFrames are not mutated in place)
	log_data = property_data.copy()
	log_data['Price'] = np.log(property_data['Price'])

	# Scale the sample data using the natural logarithm
	log_samples = samples.copy()
	log_samples['Price'] = np.log(samples['Price'])
	print("\nSamples after scaling:")
	display(log_samples)

	# Produce a scatter matrix for each pair of newly-transformed features
	pd.scatter_matrix(log_data, alpha = 0.3, figsize = (14, 8), diagonal = 'kde')
	plt.show()
	return log_data, log_samples
Example #17
File: explore.py, Project: bchaplin1/hazard
def visualize(data):
    # visualization
    import seaborn as sns
    import matplotlib.pyplot as plt

    # scatter matrix in Seaborn
    sns.pairplot(data)

    # scatter matrix in Pandas
    pd.scatter_matrix(data, figsize=(12, 10))

    # Use a **correlation matrix** to visualize the correlation between all numerical variables.

    # compute correlation matrix
    data.corr()

    # display correlation matrix in Seaborn using a heatmap
    sns.heatmap(data.corr())
Example #18
def openFile(filename):
	
	df_genes = pd.read_csv(filename)
	df_genes2 = df_genes[['A', 'C', 'D', 'B']]
	
	# print df_genes2.head()
	# plt.show()
	gene_scatter = pd.scatter_matrix(df_genes2)
	gene_scatter
	plt.show()
def colored_scatter_matrix(data, colors, title, save=None):
    """ Scatter matrix with parametrized colors (e.g. classes) """
    print('Plot scatter matrix...')
    fig, ax = plt.subplots(figsize=(12.0, 7.5))
    pd.scatter_matrix(
        data,
        diagonal='kde',
        figsize=(10, 10),
        ax=ax,
        c=colors,
        cmap=None
    )
    ax.set_title(title)
    if save:
        fig.savefig(save)
    else:
        mng = plt.get_current_fig_manager()
        mng.window.showMaximized()
        plt.show()
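
# Hedged usage sketch (added) for the helper above; the random data and the
# two-class color mapping are illustrative only.
import numpy as np
import pandas as pd

demo = pd.DataFrame(np.random.randn(100, 3), columns=list('abc'))
demo_labels = np.random.randint(0, 2, size=100)
colored_scatter_matrix(demo, np.where(demo_labels == 0, 'red', 'blue'),
                       'Features colored by class', save='matrix.png')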
Example #20
def get_iris_dataset():
    iris_dataset = load_iris()

    #1. The format of the dataset
    print("Keys of iris_dataset: \n{}".format(iris_dataset.keys()))
    print("Target names: {}".format(iris_dataset['target_names']))
    print("Feature names: \n{}".format(iris_dataset['feature_names']))
    # data -> numpy.ndarray
    # rows -> the samples
    # columns -> the features
    print("Shape of data: {}".format(iris_dataset['data'].shape))     # (150, 4)
    print("Shape of target: {}".format(iris_dataset['target'].shape)) # (150,)

    #import pdb; pdb.set_trace()

    #2. split the dataset into training set and testing set
    # y = f(X)
    X_train, X_test, y_train, y_test = train_test_split(iris_dataset['data'], iris_dataset['target'],test_size=0.2, random_state=0)
    print("X_train shape: {}".format(X_train.shape))
    print("y_train shape: {}".format(y_train.shape))

    print("X_test shape: {}".format(X_test.shape))
    print("y_test shape: {}".format(y_test.shape))


    #import pdb; pdb.set_trace()

    # 3. inspect the data - visualize it
    # convert the NumPy array into a pandas DataFrame
    iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)

    # pdb; pdb.set_trace()
    grr = pd.scatter_matrix(iris_dataframe, c=y_train, figsize=(15,15), marker='o', hist_kwds={'bins':20}, s=60, alpha=.8, cmap=mglearn.cm3)
    plt.show()


    #import pdb; pdb.set_trace()

    # The model
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=1)
    # build the model on the training set
    knn.fit(X_train, y_train)

    # the prediction
    X_new = np.array([[5, 2.9, 1, 0.2]])
    prediction = knn.predict(X_new)
    print("Prediction: {}".format(prediction))
    print("Predicted target name: {}".format(iris_dataset['target_names'][prediction]))

    y_pred = knn.predict(X_test)
    print("Test set predictions:\n {}".format(y_pred))
    print("Test set score: {:.2f}".format(np.mean(y_pred==y_test)))
def data_analysis_and_correlation(df_education, df_gdp):
    """ Analysis and Correlation education data with gdp. """
    print "[Data Analysis and Correlation of Education to GDP data] ==> Begin"
    common_countries = list(set(df_education['Country'].tolist()) & set(df_gdp['Country'].tolist()))
    gdp = []
    total_school_time = []
    men_school_time = []
    women_school_time = []
    for cntry in common_countries:
        df1 = df_education[df_education['Country'] == cntry]
        df2 = df_gdp[df_gdp['Country'] == cntry]
        if df2['GDP_'+ df1['Year'].iloc[0]].iloc[0] != '':
            total_school_time.append(int(df1['Total_School_Time'].iloc[0]))
            men_school_time.append(int(df1['Men_School_Time'].iloc[0]))
            women_school_time.append(int(df1['Women_School_Time'].iloc[0]))
            gdp.append(math.log(df2['GDP_'+ df1['Year'].iloc[0]].iloc[0]))
    df_edu_to_gdp = pd.DataFrame({'Total': total_school_time, 'Men': men_school_time, \
                                  'Women': women_school_time, 'GDP': gdp})    
    print(df_edu_to_gdp.corr(), "\n")
    
    gdp_np_array = np.array(df_edu_to_gdp.GDP.tolist())
    for col in ['Women', 'Men', 'Total']:
        r_val, p_val = sp(gdp_np_array, np.array(df_edu_to_gdp[col].tolist()))
        print "Correlation of GDP against {}:".format(col)
        print "Pearsons correlation coefficient: {}".format(r_val)
        print "2-tailed p-values: {}\n".format(p_val)
        
    # Scatter matrix plot with histogram of data plots in the diagonal
    pd.scatter_matrix(df_edu_to_gdp, alpha=0.05, figsize=(10, 10), diagonal='hist')
    plt.savefig('figures/education_to_gdp/data_education_gdp_analysis.png')
    plt.clf()
#     
#         ==> Conclusion / Summary
#                    GDP       Men     Total     Women
#        GDP    1.000000  0.495794  0.479050  0.497923
#        Men    0.495794  1.000000  0.971663  0.942572
#        Total  0.479050  0.971663  1.000000  0.977217
#        Women  0.497923  0.942572  0.977217  1.000000
#       
    print """
Example #22
def scattermatrix(tables):
    fig = plot.figure(frameon=False,facecolor='white')
    index=common_index(tables)
    data=pd.DataFrame(index=index)
    for i in tables:
        data[i[0]] = i[1].loc[index]['MEDIAN']
    axs=pd.scatter_matrix(data, alpha=0.2, figsize=(8,8), diagonal='none', marker='.',)
    
    for ax in axs[:, 0]:
        ax.grid(False, axis='both')
        ax.set_ylabel(wrap(ax.get_ylabel()), rotation=0, va='center', labelpad=30)
        ax.set_yticks([])
    for ax in axs[-1, :]:
        ax.grid(False, axis='both')
        ax.set_xlabel(wrap(ax.get_xlabel()), rotation=90)
        ax.set_xticks([])
    return fig        
Example #23
    def _doplot(self, data, ax, kind, subplots, kwargs):
        """Do core plotting"""

        cols = data.columns
        rows = int(round(np.sqrt(len(data.columns)),0))
        if len(data.columns) == 1:
            kwargs['subplots'] = 0
        if kind == 'pie':
            kwargs['subplots'] = True
        if subplots == 0:
            layout = None
        else:
            layout=(rows,-1)
        if kind == 'bar':
            if len(data) > 50:
                ax.get_xaxis().set_visible(False)
            if len(data) > 400:
                print ('too many bars to plot')
                return
        if kind == 'scatter':
            axs = self.scatter(data, ax, **kwargs)
            if kwargs['sharey'] == 1:
                lims = self.fig.axes[0].get_ylim()
                for a in self.fig.axes:
                    a.set_ylim(lims)
        elif kind == 'boxplot':
            axs = data.boxplot(ax=ax, rot=kwargs['rot'], grid=kwargs['grid'])
            #boxplot won't accept required kwargs?
            if kwargs['logy'] == 1:
                ax.set_yscale('log')
        elif kind == 'histogram':
            bins = int(kwargs['bins'])
            axs = data.plot(kind='hist',layout=layout, ax=ax, **kwargs)
        elif kind == 'heatmap':
            axs = self.heatmap(data, ax, kwargs)
        elif kind == 'bootstrap':
            axs = plotting.bootstrap_plot(data)
        elif kind == 'scatter_matrix':
            axs = pd.scatter_matrix(data, ax=ax, **kwargs)
        elif kind == 'hexbin':
            x = cols[0]
            y = cols[1]
            axs = data.plot(x,y,ax=ax,kind='hexbin',gridsize=20,**kwargs)
        else:
            axs = data.plot(ax=ax, layout=layout, **kwargs)
        return axs
Example #24
def realiseData():
    data = pd.read_csv(csvPath)
    # data.plot(kind='density', subplots=True, layout=(3, 3), sharex=False)
    pd.scatter_matrix(data)
    plt.show()
Example #25
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
"""
DBSCN
    describe:
       核心对象: 某个点的密度达到阈值(minPts)则为核心点
       邻域阈值(r)

       传销算法


"""

colors = np.array(['red', 'green', 'blue', 'yellow'])

# Read the data
beer = pd.read_csv('./data/data.txt', sep=' ')
X = beer[["calories", "sodium", "alcohol", "cost"]]

# dbscan
db = DBSCAN(eps=10, min_samples=2).fit(X)

beer['cluster_db'] = db.labels_
beer.groupby('cluster_db').mean()

pd.scatter_matrix(X, c=colors[beer.cluster_db], figsize=(10, 10), s=100)
plt.show()
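
# Note (added): DBSCAN labels noise points -1, which the fancy indexing above
# silently maps to colors[-1] ('yellow'). A hedged sketch that colors noise
# explicitly instead:
noise = beer['cluster_db'] == -1
point_colors = np.where(noise, 'gray', colors[beer['cluster_db'] % len(colors)])
pd.scatter_matrix(X, c=point_colors, figsize=(10, 10), s=100)
plt.show()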
Example #26
df.groupby('species').agg(np.mean)
df.groupby('species').agg([np.min, np.max])
df.groupby('species').describe()

# explore data by sorting, looking for differences between species
df.sort_values(by='sepal_length').values
df.sort_values(by='sepal_width').values
df.sort_values(by='petal_length').values
df.sort_values(by='petal_width').values

# explore data visually, looking for differences between species
df.petal_width.hist(by=df.species, sharex=True)
df.boxplot(column='petal_width', by='species')
df.boxplot(by='species')
df.plot(x='petal_length', y='petal_width', kind='scatter', c=iris.target)
pd.scatter_matrix(df, c=iris.target)

## PART 2: Write a function to predict the species for each observation

# create a dictionary so we can reference columns by name
col_ix = {col: index for index, col in enumerate(df.columns)}


# define function that takes in a row of data and returns a predicted species
def classify_iris(data):
    if data[col_ix['petal_length']] < 3:
        return 'setosa'
    elif data[col_ix['petal_width']] < 1.8:
        return 'versicolor'
    else:
        return 'virginica'
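
# Hedged check (added): apply the rule to every row and measure agreement with
# the labeled species (assumes df keeps a 'species' column using these names).
predictions = [classify_iris(row) for row in df.values]
print('rule-based accuracy:', (pd.Series(predictions, index=df.index) == df['species']).mean())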
scaled_df.describe()




# Correlation matrix
scaled_df.corr()




# Correlation plots
pd.scatter_matrix(scaled_df, figsize=(22,22))
plt.show()




# Correlation heatmap
sns.set(rc={'figure.figsize':(80,10)})

corr = scaled_df.corr()
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
Example #28
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt

wine = pd.read_csv('/Users/Shared/py/winequality-red.csv', sep=';')

clf = linear_model.LinearRegression()

X = wine.drop(['quality'], axis=1)

Y = wine['quality']

clf.fit(X, Y)

print(clf.coef_)
print(clf.intercept_)

print(
    pd.DataFrame({
        "Name": X.columns,
        "Coefficients": clf.coef_
    }).sort_values(by='Coefficients'))
plt.matshow(wine.corr())
pd.scatter_matrix(wine)
plt.scatter(X['alcohol'], Y)  # scatter needs 1-D x; 'alcohol' chosen here as an illustrative column
Example #29
lookup_fruit_name = dict(
    zip(fruits.fruit_label.unique(), fruits.fruit_name.unique()))
lookup_fruit_name

#split the data into training and test sets on the target variable fruit_label; random_state acts like a seed in R
X = fruits[['height', 'width', 'mass', 'color_score']]
y = fruits['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

#visualize the data as a pairwise scatterplot of all independent variables, colored by the target
from matplotlib import cm
cmap = cm.get_cmap('gnuplot')
scatter = pd.scatter_matrix(X_train,
                            c=y_train,
                            marker='o',
                            s=40,
                            hist_kwds={'bins': 15},
                            figsize=(9, 9),
                            cmap=cmap)

#visualize in 3d
# plotting a 3D scatter plot
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_train['width'],
           X_train['height'],
           X_train['color_score'],
           c=y_train,
           marker='o',
Example #30
# equivalent ways to replace spaces in column names with underscores
ufo_cols = [name.replace(' ', '_') for name in ufo.columns]
ufo.columns = ufo_cols
ufo.columns = ufo.columns.str.replace(' ', '_')  # string-method shortcut
# ufo.Location = ufo.City + ', ' + ufo.State
ufo['Location'] = ufo.City + ', ' + ufo.State

users = pd.read_table('u.user', sep='|', index_col='user_id')
users.groupby('occupation').count()
users.occupation.value_counts()
users.groupby('occupation').age.mean()
users.groupby('occupation').age.agg(['min', 'max'])
users.groupby(['occupation', 'gender']).age.mean()
users.groupby(['occupation', 'gender']).age.agg(['mean', 'count'])

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 8)

drinks[['beer', 'wine']].sort_values('beer').values
drinks.plot(kind='scatter', x='beer', y='wine', alpha=.3)
plt.xlabel('Beer')
plt.ylabel('Wine')
pd.scatter_matrix(drinks[['beer', 'spirit', 'wine']], figsize=(10, 8))
plt.style.use('ggplot')
drinks.continent.value_counts().plot(kind='bar')
drinks.groupby('continent').mean().plot(kind='bar', figsize=(10, 8))
drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar')
drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar',
                                                               stacked=True)
Example #31
def scatterplot(data, title=None, color=None):
    pd.scatter_matrix(data, alpha=0.3, diagonal='kde', color=color)
    if title is not None:
        plt.suptitle(title)
    plt.show()
Example #32
# Plot the data (similar to before)
plt.plot(x_prime, y_hat, 'r', linewidth=2, alpha=0.9)
"""
COMMON PROBLEMS - Multicollinearity
"""

# Now let's run a multiple linear regression
# The temp variable is no longer significant. Why? Multicollinearity
est_m = smf.ols(formula='cnt ~ atemp + temp + workingday + windspeed',
                data=bike_dat).fit()
est_m.summary()

# Scatter plot (observe the (unsurprising) correlation between atemp and temp)
cols = ['cnt', 'atemp', 'windspeed', 'weathersit', 'temp', 'workingday', 'hum']
pd.scatter_matrix(bike_dat[cols])

# Correlation coefficient matrix
corr_matrix = np.corrcoef(bike_dat[cols].T)
sm.graphics.plot_corr(corr_matrix, xnames=cols)

# Let's say we wanted to include an interaction term
# We would do this by including the ':' between interacting variables
est_m = smf.ols(formula='cnt ~ temp + windspeed + temp:windspeed + workingday',
                data=bike_dat).fit()

est_m.summary()

# An alternate way of specifying interaction terms
# a*b is equivalent to a + b + a:b
est_m = smf.ols(formula='cnt ~ temp*windspeed + workingday',
Example #33
wiki_data = wiki_data.set_index('Date')
wiki_data.index = wiki_data.index.map(lambda x : parse(x))
wiki_data['changes'] = wiki_data['changes'].astype(int)



death_data = pd.read_csv('CausesOfDeath_France_2001-2008.csv')
death_data['Value'] = death_data['Value'].str.replace(' ','')
death_data['Value'] = death_data['Value'].apply(lambda x : int(re.compile(r'[^0-9]').sub('0',x)))
death_data = death_data[['ICD10','Value','SEX','TIME']]

causes = death_data.groupby('ICD10')['Value'].sum().sort_values(ascending=False)[0:5].index.values

filtered = death_data[death_data['ICD10'].isin(causes)]

filtered_agg = filtered.groupby(['ICD10','TIME']).sum()

filtered_agg.reset_index().pivot('TIME', 'ICD10','Value').plot()
filtered_agg.reset_index().pivot('TIME', 'ICD10','Value').plot(kind="bar")
filtered_agg.reset_index().pivot('TIME', 'ICD10','Value').plot(kind="barh")
filtered_agg.reset_index().pivot('TIME', 'ICD10','Value').plot(kind="barh", stacked=True)

cars = pd.read_csv('cars.csv',sep=';',index_col=0).drop('STRING')
cars['MPG'] = cars['MPG'].astype(float)
cars['Cylinders'] = cars['Cylinders'].astype(float)
cars['Weight'] = cars['Weight'].astype(float)
cars['Acceleration'] = cars['Acceleration'].astype(float)
cars['Horsepower'] = cars['Horsepower'].astype(float)
pd.scatter_matrix(cars, diagonal='kde', color='k', alpha=0.3)

Example #34
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('ml-bank').getOrCreate()
df = spark.read.csv('bank.csv', header=True, inferSchema=True)
df.printSchema()

import pandas as pd

pd.DataFrame(df.take(5), columns=df.columns).transpose()

numeric_features = [t[0] for t in df.dtypes if t[1] == 'int']
print(df.select(numeric_features).describe().toPandas().transpose())

numeric_data = df.select(numeric_features).toPandas()
axs = pd.scatter_matrix(numeric_data, figsize=(8, 8))
n = len(numeric_data.columns)
for i in range(n):
    v = axs[i, 0]
    v.yaxis.label.set_rotation(0)
    v.yaxis.label.set_ha('right')
    v.set_yticks(())
    h = axs[n - 1, i]
    h.xaxis.label.set_rotation(90)
    h.set_xticks(())

df = df.select('age', 'job', 'marital', 'education', 'default', 'balance',
               'housing', 'loan', 'contact', 'duration', 'campaign', 'pdays',
               'previous', 'poutcome', 'deposit')
cols = df.columns
print(df.printSchema())
Example #35
dataset['quality'].unique()  # quality takes values 3-9

dataset.head()

dataset.tail()

#To find the statistical summary
dataset.describe()

#Univariate Analysis
dataset.hist()

#Multivariate Analysis
from pandas.plotting import scatter_matrix  # pandas.tools.plotting was removed in newer pandas

scatter_matrix(dataset)

#Group the dependent variable and independent variables
array=dataset.values
X=array[:,0:11]
Y=array[:,11]

#Splitting the dataset into training set and test set
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.20,random_state=0)


# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
Example #36
iris.petal_width.hist(by=iris.species, sharex=True)
iris.boxplot(column='petal_width', by='species')
iris.boxplot(by='species')

# map species to a numeric value so that plots can be colored by category
iris['species_num'] = iris.species.map({
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2
})
iris.plot(kind='scatter',
          x='petal_length',
          y='petal_width',
          c='species_num',
          colormap='Blues')
pd.scatter_matrix(iris, c=iris.species_num)

## TASK 4

# If petal length is less than 3, predict setosa.
# Else if petal width is less than 1.8, predict versicolor.
# Otherwise predict virginica.

## BONUS


# define function that accepts a row of data and returns a predicted species
def classify_iris(row):
    if row[2] < 3:  # petal_length
        return 0  # setosa
    elif row[3] < 1.8:  # petal_width
Example #37
centers = beer.groupby("cluster3").mean().reset_index()
print(centers)

# Visualize the clustering result (k=3)
from pandas import scatter_matrix
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.size'] = 14
colors = np.array(['red', 'green', 'blue', 'yellow'])
plt.scatter(beer["calories"], beer["alcohol"], c=colors[beer["cluster3"]])

plt.scatter(centers.calories,
            centers.alcohol,
            linewidths=3,
            marker='+',
            s=300,
            c='black')

plt.xlabel("Calories")
plt.ylabel("Alcohol")
plt.show()

scatter_matrix(beer[["calories", "sodium", "alcohol", "cost"]],
               s=100,
               alpha=1,
               c=colors[beer["cluster3"]],
               figsize=(10, 10))
plt.suptitle("With 3 centroids initialized")
plt.show()
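
# Hedged follow-up sketch (added): quantify the k=3 clustering with a
# silhouette score (values near 1 indicate well-separated clusters).
from sklearn.metrics import silhouette_score
features = beer[["calories", "sodium", "alcohol", "cost"]]
print("silhouette (k=3):", silhouette_score(features, beer["cluster3"]))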
Example #38
print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))

# create dataframe from data in X_train
# label the columns using the strings in iris_dataset.feature_names

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
# create a scatter matrix from the dataframe, color by y_train
grr = pd.scatter_matrix(iris_dataframe,
                        c=y_train,
                        figsize=(15, 15),
                        marker='o',
                        hist_kwds={'bins': 20},
                        s=60,
                        alpha=.8,
                        cmap=mglearn.cm3)
#pip install mglearn

#Building Your First Model: k-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
#knn = KNeighborsClassifier(n_neighbors=7)

knn.fit(X_train, y_train)

#Making Predictions
X_new = np.array([[5, 2.9, 1, 0.2]])
print("X_new.shape: {}".format(X_new.shape))
sv = df.groupby(['Survived', 'Pclass', 'Sex'])['Name'].count()
sv.unstack().plot.bar()
plt.savefig('bars_gruppen.png')


# 5. Pair plot
def make_col(x):
    """Color by survival"""
    if x == 0:
        return (1, 0, 0)  # red
    else:
        return (0, 0, 1)  # blue


col = df['Survived'].apply(make_col)
pd.scatter_matrix(df, c=col, figsize=(15, 15))
plt.savefig('paarplot.png')

# 7. Data preparation
del df['Cabin']
del df['Name']

df = df.dropna()

X = df[['Pclass', 'Age']].values
y = df['Survived'].values

# 8. Build the model
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

m = KNeighborsClassifier(n_neighbors=1)
ax1.scatter(hollywood_movies["Profitability"], hollywood_movies["Audience Rating"])
ax1.set_xlabel("Profitability")
ax1.set_ylabel("Audience Rating")
ax1.set_title("Hollywood Movies, 2017-2011")
ax2.scatter(hollywood_movies["Audience Rating"], hollywood_movies["Profitability"])
ax2.set_xlabel("Audience Rating")
ax2.set_ylabel("Profitability")
ax2.set_title("Hollywood Movies, 2017-2011")
plt.show()


## 3. Scatter matrix - profitability and critic ratings ##

normal_movies = hollywood_movies[hollywood_movies["Film"] != "Paranormal Activity"]
filtered_movies = normal_movies[["Profitability","Audience Rating"]]
pd.scatter_matrix(filtered_movies,figsize = (6,6))
plt.show()


## 4. Box plot - audience and critic ratings ##

normal_movies.boxplot(column = ["Critic Rating","Audience Rating"])

## 5. Box plot - critic vs audience ratings per year ##

normal_movies = normal_movies.sort_values("Year")
fig = plt.figure(figsize = (8,4))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
sns.boxplot(data=normal_movies[pd.notnull(normal_movies["Genre"])], x = "Year",y = "Critic Rating", ax = ax1)
sns.boxplot(data = normal_movies[pd.notnull(normal_movies["Genre"])], x = "Year", y = "Audience Rating", ax = ax2)
# [truncated above: FICO-score x-axis tick labels (640-840) passed to set_xticklabels]
q0 = p.set_xlabel('FICO Score')
q1 = p.set_ylabel('Interest Rate %')
q2 = p.set_title('Lending Rate Plot')

#Create a new data frame with selected columns for analysing data
loansmin = loansdata.filter([
    'Interest.Rate', 'FICO.Score', 'Loan.Length', 'Monthly.Income',
    'Amount.Requested'
],
                            axis=1)

a = pd.scatter_matrix(loansmin, alpha=0.05, figsize=(10, 10), diagonal='hist')
# a = pd.scatter_matrix(loansmin,alpha=0.05,figsize=(10, 10), diagonal='kde')
# a = pd.scatter_matrix(loansmin,alpha=0.05,figsize=(8, 8), diagonal='kde')
# a = pd.scatter_matrix(loansmin,alpha=0.05,figsize=(12, 12), diagonal='kde')

interest_rate = loansmin['Interest.Rate']
loan_amount = loansmin['Amount.Requested']
fico_score = loansmin['FICO.Score']

y = np.matrix(interest_rate).transpose()
x1 = np.matrix(fico_score).transpose()
x2 = np.matrix(loan_amount).transpose()

x = np.column_stack([x1, x2])

X = sm.add_constant(x)
Example #42
#
#    plt.xticks(np.arange(len(frame)), values)
#    plt.legend((nonsurv_bar[0], surv_bar[0]),('Did not survive', 'Survived'), framealpha = 0.8)
#
## Common attributes for plot formatting
#plt.xlabel(key)
#plt.ylabel('Number of Passengers')
#plt.title('Passenger Survival Statistics With \'%s\' Feature'%(key))
#plt.show()



# Then look at correlations
# This will also be quite problem-specific since mixture of variables are tricky
# In principle I'd like to see some joint stats
pd.scatter_matrix(data_trn, alpha=0.3, figsize=(5,6), diagonal='kde');
# In case of mixed data this really doesn't give you a good sense of relationships
# I guess you might split into continuous and categorical, but still how about the relationship between continuous and categorical?
# Note: L-shaped pairs of variables: if you sum or take the product you get stuff that is more constant or maybe linear, maybe it tells you something
# You have all kind of 'garbage' continuous with categorical or binary and 
# all combos of those
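
# A sketch acting on the split suggested above (added; assumes data_trn is the
# training DataFrame): scatter-matrix only the numeric columns and summarize
# the categorical ones separately.
numeric_part = data_trn.select_dtypes(include='number')
pd.scatter_matrix(numeric_part, alpha=0.3, diagonal='kde')
for col in data_trn.select_dtypes(exclude='number'):
    print(data_trn[col].value_counts())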

# Maybe you can try to see a pair and the class
clr = ['r', 'b', 'y', 'm', 'c', 'k']
col_i = 'SibSp'
col_j = 'Parch'
# Adding some random noise to distinguish the dots
Z = DataFrame(np.random.rand(nTrn,2), index=data_trn.index)
dxy = 0.45
for j in range(len(set(y_trn))):
    ix = y_trn==j
Example #43
drinks.plot(kind='scatter', x='beer_servings', y='wine_servings', alpha=0.3)

# same scatterplot, except point color varies by 'spirit_servings'
# note: must use 'c=drinks.spirit_servings' prior to pandas 0.15.0
drinks.plot(kind='scatter',
            x='beer_servings',
            y='wine_servings',
            c='spirit_servings',
            colormap='Blues')

# same scatterplot, except all European countries are colored red
colors = np.where(drinks.continent == 'EU', 'r', 'b')
drinks.plot(x='beer_servings', y='wine_servings', kind='scatter', c=colors)

# scatterplot matrix of all numerical columns
pd.scatter_matrix(drinks)
'''
Advanced Filtering (of rows) and Selecting (of columns)
'''

# loc: filter rows by LABEL, and select columns by LABEL
users.loc[1]  # row with label 1
users.loc[1:3]  # rows with labels 1 through 3
users.loc[1:3,
          'age':'occupation']  # rows 1-3, columns 'age' through 'occupation'
users.loc[:,
          'age':'occupation']  # all rows, columns 'age' through 'occupation'
users.loc[[1, 3], ['age',
                   'gender']]  # rows 1 and 3, columns 'age' and 'gender'

# iloc: filter rows by POSITION, and select columns by POSITION
Example #44
# performance in Paris
pres[pres.dep=="PARIS"]

''' VISUALIZATION '''

pres.ump.plot(kind='hist', bins=20)
pres.ps.plot(kind='hist', bins=20)
pres.fn.plot(kind='hist', bins=20)

pres[['ump', 'ps']].sort_values('ump').values
pres.plot(kind='scatter', x='ps', y='ump') # fits hypothesis: higher UMP votes, lower PS votes
pres.plot(kind='scatter', x='ump', y='fn') # line not as evident; but votes may have been interchangeable

# demonstration of vote distribution relationships between binomes
pd.scatter_matrix(pres[['ump', 'ps', 'fn']], figsize=(10, 8))

pres[['ump', 'ps', 'fn']].plot(kind='hist', stacked=True)

# testing hypothesis of voters "so far on the left they come out on the (far) right"
pd.scatter_matrix(pres[['fn', 'ug1', 'ug2']], figsize=(10, 8))
# ^^ it works!

pd.scatter_matrix(pres[['fn', 'ug2', 'ug3']], figsize=(10, 8))


'''
Data source: http://data.gouv.fr

Data desc: 
    print("acc_train = {}, acc_test ={}".format(acc_train, acc_test))
    print("Confusion Matrix:\n{}\n\n {} \n".format(CML, CM))
    print("f1_train = {}, f1_test ={}".format(f1_train, f1_test))
    print("fbeta_train = {}, fbeta_test ={}".format(fb_train, fb_test))
    print("ROC_AUC_train = {}, ROC_AUC_test ={}".format(
        roc_auc_train, roc_auc_test))

############################################# Initial Visual Tests #####################################################
########## ScatterMatrixPlot ##########

if False:
    #Transformed features

    pd.scatter_matrix(biochemistry_data,
                      alpha=0.3,
                      figsize=(16, 8),
                      diagonal='kde')
    plt.show()

if False:

    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    from sklearn.decomposition import PCA

    ndims = 2
    dim_labels = []
    for i in range(1, ndims + 1):
        dim_labels.append("Dimension {}".format(i))
Example #46
fileName = r'../dataSet/Auto.csv'
#if 'coerce', then invalid parsing will be set as NaN
df = pd.read_csv(fileName)
df_numeric = df.apply(pd.to_numeric, args=('coerce',))
mask = ~np.isnan(df_numeric['cylinders'].values) & ~np.isnan(df_numeric['displacement'].values)\
       & ~np.isnan(df_numeric['horsepower'].values) & ~np.isnan(df_numeric['weight'].values)\
       & ~np.isnan(df_numeric['acceleration'].values) & ~np.isnan(df_numeric['year'].values)\
       & ~np.isnan(df_numeric['origin'].values)
X_raw = df_numeric[['cylinders','displacement','horsepower','weight','acceleration','year','origin']][mask]
y = df_numeric['mpg'][mask]

X = sm.add_constant(X_raw)
est = sm.OLS(y,X).fit()
print('Exercise 9 Answer:')
print('(a) see figure 1')
pd.scatter_matrix(df, alpha=0.5)
print('(b) ')
#correlations = np.corrcoef(pd.concat([y, X_raw], axis=1), rowvar=0)
correlations = np.corrcoef(df_numeric.loc[:,'mpg':'origin'][mask], rowvar=0)
print('(c)')
print(est.summary())
print('(c) i. The null hypothesis that all the regression coefficients are zero can be rejected, given the large F-statistic with a very small p-value.')
print('(c) ii. From the p-value of each predictor, every predictor has a statistically significant relationship to the response except cylinders, horsepower and acceleration.')
print('(c) iii. The coefficient of year shows a positive relationship: an increase of 1 year yields a 0.7508 increase in mpg, meaning cars become more fuel efficient over the years.')
print('(d) see figure 2.')
plt.figure(2)
# R's plot for an lm object generates 6 plots: residuals vs fitted values, sqrt(|residuals|) vs fitted values,
# a Normal Q-Q plot, Cook's distances vs row labels, residuals vs leverage, and Cook's distances vs leverage.
# By default the first 3 and the 5th are shown; we plot the defaults in Python.

#residuals vs fitted values
Example #47

# Task 2
#
# Get an overview of the values
# in the columns *Art* (type) and *Status*.
print("\nShip types:")
print(df['Art'].value_counts())
print("\nShip statuses:")
print(df['Status'].value_counts())


# Task 3
#
# Look for possible correlations.
pd.scatter_matrix(df)
plt.savefig('matrix.png')


# Task 4
#
# Plot 'Länge' (length) against 'Höhe' (height) as a scatter plot.
df.plot.scatter('Länge', 'Höhe')
plt.savefig('scatter.png')


# Task 5
#
# One of the entries contains a **data error**.
print("\nEntry with data error:")
print(df.loc['HMS Hood'].transpose())
Example #48
import scipy
import numpy
import matplotlib
import pandas
import sklearn

# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)

# head
print(dataset.head(20))

# descriptions
print(dataset.describe())

dataset.plot(kind='box',
             subplots=True,
             layout=(2, 2),
             sharex=False,
             sharey=False)
matplotlib.pyplot.show()

dataset.hist()
matplotlib.pyplot.show()

# scatter plot matrix
pandas.scatter_matrix(dataset)
matplotlib.pyplot.show()
Example #49
    score = r2_score(y_test, pred)  # r2_score expects (y_true, y_pred)

    scores.append(score)

#calculate mean of all 1000 scores
score = np.mean(scores)
print "\nR^2 score for predicting Milk is: ", score

#OBSERVATION
#A low R^2 value indicates that Milk cannot be predicted very accurately from all the features we have.
#However, since the value is positive, some features must predict it with higher accuracy, so the model does fit the data.
#We should therefore keep this feature for identifying customer habits.

##################################################################################################################
''' VISUALIZATION OF FEATURE DATA'''

#visualize data, with the diagonal showing each feature's distribution
pd.scatter_matrix(data, alpha=0.3, figsize=(14, 8), diagonal='kde')
#plt.show()
'''FEATURE SCALING USING LOG'''

# Scale the data using the natural logarithm
log_data = np.log(data)

# Scale the sample data using the natural logarithm
log_samples = np.log(samples)

# Produce a scatter matrix for each pair of newly-transformed features
pd.scatter_matrix(log_data, alpha=0.3, figsize=(14, 8), diagonal='kde')
plt.show()

print "\nScaled sampled data:\n"
print log_samples
# Scatter plots
macro = pd.read_csv(r'C:\Users\z.chen7\Downloads\Python\pyhton_for_data_science' \
                    '\macrodata.txt')
macro.head()

data = macro[['cpi','m1','tbilrate','unemp']]
data.head()

data.head()
trans_data = np.log(data).diff().dropna()

plt.scatter(trans_data['m1'], trans_data['unemp'])
plt.title('Changes in log %s vs. log %s' % ('m1','unemp'))

pd.scatter_matrix(trans_data, diagonal='kde', color='k', alpha=0.3)


#  Plotting map

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv(r'C:\Users\z.chen7\Downloads\Python\pyhton_for_data_science' \
                   '\ch08_Haiti.csv')
data.info()
data.head()
data.shape
data.columns

data[['INCIDENT DATE', 'LATITUDE','LONGITUDE']][:10]
colors = Bok_GmGFs['VV-VH']

plt.scatter(Bok_GmGFs['gap_fraction'], Bok_GmGFs['VH-VVnorm'], c=colors, alpha=0.3, cmap='viridis')
plt.ylabel("Normalized VH-VV Backscatter (Gamma0 dB)")
plt.xlabel("Canopy Gap Fraction")
plt.colorbar();
plt.savefig("Correlation_VH-VVNormVsGapFraction.tiff", dpi=300)
plt.savefig("Correlation_VH-VVNormVsGapFraction.pdf", dpi=300)
#plt.legend()

Bok_GmGFs2 = Bok_GmGFs.drop('Year', 1)
Bok_GmGFs2 = Bok_GmGFs2.drop(Bok_GmGFs2.columns[[0, 1]], axis=1)

Bok_GmGFs2 = pd.DataFrame(Bok_GmGFs2)
pd.scatter_matrix(Bok_GmGFs2, alpha=0.2, figsize=(10, 10), diagonal='kde')
plt.savefig("Scatter_Gamma0_Bands_GapFraction.tiff", dpi=300)
plt.savefig("Scatter_Gamma0_Bands_GapFraction.pdf", dpi=300)
plt.show()  # show after saving so the written files are not blank


pp = sns.pairplot(data = Bok_GmGFs,
                  y_vars =['gap_fraction'],
                  x_vars = ['VH-VVnorm','VVVHratio','VV-VH'])
plt.savefig("GapFraction_PairPlot_meanGamma0GFstd.tiff", dpi=300)
plt.savefig("GapFraction_PairPlot_meanGamma0GFstd.pdf", dpi=300)

Bok_GmGFs.describe() # ger summary statistics of each variable in Bok_GmGFs

'''
PlotID', 'SARdate', 'VHgamma0', 'VVgamma0', 'VHdb', 'VVdb', 'VV-VH',
Example #52
scatter_matrix(dataset)

# Plotting Graph
plt.scatter(dataset['total_rooms'], dataset['total_bedrooms'])
plt.show()

x = np.arange(-10, 10, 0.01)
y = 0.7 * x + 5
plt.plot(x, y)
plt.show()

y1 = 0.7 * x**2 + x + 8
plt.plot(x, y1)
plt.show()

sig_y = 1 / (1 + np.power(np.e, -x))
plt.plot(x, sig_y)
plt.show()

a = np.random.randn(10)
b = np.random.randn(5, 5)

pd.scatter_matrix(dataset.loc[:, :])
pd.show_versions(as_json=False)

corr_mat = dataset.corr()
sns.heatmap(corr_mat, annot=True)

np.arange(23, 55, 2)
np.linspace(0, 100, 6)
Example #53
# boxplot of beer servings by continent (shows five-number summary and outliers)
drinks.boxplot(column="beer_servings", by="continent")

# scatterplot of beer servings versus wine servings
drinks.plot(kind="scatter", x="beer_servings", y="wine_servings", alpha=0.3)

# same scatterplot, except point color varies by 'spirit_servings'
# note: must use 'c=drinks.spirit_servings' prior to pandas 0.15.0
drinks.plot(kind="scatter", x="beer_servings", y="wine_servings", c="spirit_servings", colormap="Blues")

# same scatterplot, except all European countries are colored red
colors = np.where(drinks.continent == "EU", "r", "b")
drinks.plot(x="beer_servings", y="wine_servings", kind="scatter", c=colors)

# scatterplot matrix of all numerical columns
pd.scatter_matrix(drinks)


"""
Advanced Filtering (of rows) and Selecting (of columns)
"""

# loc: filter rows by LABEL, and select columns by LABEL
users.loc[1]  # row with label 1
users.loc[1:3]  # rows with labels 1 through 3
users.loc[1:3, "age":"occupation"]  # rows 1-3, columns 'age' through 'occupation'
users.loc[:, "age":"occupation"]  # all rows, columns 'age' through 'occupation'
users.loc[[1, 3], ["age", "gender"]]  # rows 1 and 3, columns 'age' and 'gender'

# iloc: filter rows by POSITION, and select columns by POSITION
users.iloc[0]  # row with 0th position (first row)
Example #54
ForwardU = Forward1.loc[Forward1.Status=='UFA',:]
ForwardR = Forward1.loc[Forward1.Status=='RFA',:]

# Correlation

#goalies 
#correlation across category
Gcor = G1617.loc[:, ['Ovrl', 'SV%', 'Supp', 'ReMin', 'HighSV%', 'PP SV%', 'FA', 'SO SV%', 'Cap Hit', 'Ginj']]
Gcor.corr()
plt.matshow(Gcor.corr())
plt.xticks(range(len(Gcor.columns)), Gcor.columns, fontsize=10, color='blue', rotation = 'vertical')
plt.yticks(range(len(Gcor.columns)), Gcor.columns, fontsize=10, color='blue')
plt.colorbar()
plt.show()

pd.scatter_matrix(Gcor, alpha= 0.4, figsize=(7, 7), s=20, marker = '.', edgecolors = 'blue')
plt.show()

#correlation from one select category
Gcor2 = G1617.loc[:, ['GP', 'W', 'L', 'SA', 'SV', 'GA', 'SV%']]
Gcor2.cov()
plt.matshow(Gcor2.corr())
plt.xticks(range(len(Gcor2.columns)), Gcor2.columns, fontsize=10, color='blue', rotation = 'vertical')
plt.yticks(range(len(Gcor2.columns)), Gcor2.columns, fontsize=10, color='blue')
plt.colorbar()
plt.show()

pd.scatter_matrix(Gcor2, alpha= 0.4, figsize=(7, 7), s=20, marker = '.', edgecolors = 'blue')
plt.show()

#players
Example #55

'''
PLOTS
'''
'''
Creates a df with only the numerical columns for a scatter matrix

RESULT: Nearly all of the independent variables follow some sort of power law distribution
'''
Numerical_df = Master_df[['Num_Adv_Event','Num_Serious',
                'Num_Other','Num_Life_Threat','Num_Hosp',
                'Num_Congen_Anom','Num_Disable','Num_Deaths',
                'Num_Male','Num_Female','AE_Per_Year','Adj_Num_AE', 'Adj_Per_Year']]

pd.scatter_matrix(Numerical_df, diagonal='kde')

'''
Correlation matrix

RESULT: Num_Adv_Event is highly correlated (>0.60) with every other column
except Num_Congen_Anom, Num_Disable and Num_Deaths
'''
Corr_matrix = Master_df.corr()
Corr_matrix.to_csv(r'C:\Users\jonbryan90\Desktop\Corr_Matrix')  # raw string: '\U' would be an invalid escape

'''
Density plots by Innovation_Cat for the promising variables (Num_Adv_Event, Num_Congen_Anom, Num_Disable, Num_Deaths)
'''
Master_df.groupby('Innovation_Cat').Num_Adv_Event.plot(kind='kde',
                                                      linewidth=2.5, 
    labels = [
        'id', 'RR', 'C_S', 'U_U_C', 'A_D_R_R', 'a_d_i_r', 'a_d_a_r_r',
        'a_u_d_a_r_r', 'mb_s', 'mb_e', 'mb_sub', 'mb_esec', 'mb_inp',
        'mb_insec', 'mb_uneng', 'mb_idles'
    ]
    ax.set_xticklabels(labels, fontsize=10)
    ax.set_yticklabels(labels, fontsize=6)
    ax.matshow(corr)

    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)


plot_corr(input, 15)

from pandas import scatter_matrix
scatter_matrix(input, diagonal='kde')

san = input.corr()
corr = pd.DataFrame(san)
#plotting categorical variables

san = input.day
san.value_counts().plot(kind='bar')

#looking for unique domains:
len(set(input.from_domain_hash))

#sendex approach

# anova test for weekly data
from statsmodels.formula.api import ols
#  Clean Data:  Remove null value rows
loansData.dropna(inplace=True)

loansData['Interest.Rate'] = loansData['Interest.Rate'].map(lambda x: float(x.rstrip('%')))
loansData['Loan.Length']   = loansData['Loan.Length'].map(lambda x: int(x.rstrip('months')))
loansData['FICO.Score']    = loansData['FICO.Range'].map(lambda x: int(x.split('-')[0]))

#  Create Histogram of FICO scores 
plt.figure()
a = loansData['FICO.Score'].hist()
plt.savefig("Bar_Plot_FICO_Score.png")

#  Create Scatter Matrix of loan data
plt.figure()
a = pd.scatter_matrix(loansData, alpha=0.05, figsize=(10,10), diagonal='hist')
plt.savefig("Scatter_Matrix_Loan_Data.png")

#  Create Scatter Plot of loan data (FICO vs Interest Rate)
plt.figure()
a = loansData.plot.scatter(x = 'FICO.Score', y = 'Interest.Rate')
plt.savefig("Scatter_Plot_Loan_Data.png")

# The dependent variable
y = np.matrix(loansData['Interest.Rate']).transpose()

# The independent variables shaped as columns
x1 = np.matrix(loansData['FICO.Score']).transpose()
x2 = np.matrix(loansData['Amount.Requested']).transpose()
x = np.column_stack([x1,x2])
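
# Hedged continuation sketch (added): fit the linear model the columns above
# were stacked for (assumes statsmodels imported as sm earlier in the script).
import statsmodels.api as sm
X = sm.add_constant(x)
model = sm.OLS(y, X).fit()
print(model.summary())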
Example #58
9. class = Class variable (0 or 1) 
'''

names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
]
dataframe = pd.read_csv(url, names=names)
print(type(dataframe))

# df_head = dataframe.head()
# print df_head

# df_shape = dataframe.shape
# print df_shape

# df_dtypes = dataframe.dtypes
# print df_dtypes

# df_describe = dataframe.describe()
# print df_describe

# df_correlation = dataframe.corr()
# print df_correlation

plt.figure()
# dataframe.plot.hist(by='age')
# dataframe['age'].plot.hist()
# dataframe.plot.box(by='age')
# dataframe.plot(kind='box')
pd.scatter_matrix(dataframe)
plt.show()
Example #59
File: knn.py, Project: btrani/projects
#Calculate average sale price by zip code as proxy for zip code
avg_by_zip = df.groupby(['ZIP CODE'])['SALE PRICE'].median().reset_index()
avg_by_zip.columns = ['ZIP CODE', 'avg_sale_by_zip']
df = pd.merge(df, avg_by_zip, on='ZIP CODE', how='outer')

#Transform sale price using log normal function to normalize data
def log(x):
    return math.log(x)

df['log_sale'] = df['SALE PRICE'].apply(log)
df['log_avg_sale'] = df['avg_sale_by_zip'].apply(log)
df['gsf_log'] = df['GROSS SQUARE FEET'].apply(log)

#Investigate potential relationships via scatter matrix
a = pd.scatter_matrix(df, figsize = (10,10), diagonal='hist')

#Split into train and test data sets
labels = df['log_sale']
df_clean = df[['TOTAL UNITS', 'avg_sale_by_zip', 'GROSS SQUARE FEET']]

X_train, X_test, y_train, y_test = train_test_split(df_clean, labels, \
test_size=0.2, random_state=0)

#Prep independent and dependent variables for regression
y = np.matrix(y_train).transpose()

#Fit the OLS model
X = sm.add_constant(X_train)
model = sm.OLS(y, X)  # use the design matrix with the constant added, not the raw X_train
fitted = model.fit()
Example #60
from sklearn.pipeline import Pipeline  #imputing within a pipeline
from sklearn.svm import SVC  #support vector classification

plt.style.use('ggplot')

iris = datasets.load_iris()
type(iris)
print(iris.keys())
type(iris.data), type(iris.target)
iris.data.shape
iris.target_names
X = iris.data
y = iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
print(df.head())
_ = pd.scatter_matrix(df, c=y, figsize=[8, 8], s=150, marker='D')

knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X, y)
y_pred = knn.predict(X)
new_prediction = knn.predict(X)
print("Prediction: {}".format(new_prediction))
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=21,
                                                    stratify=y)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
confusion_matrix(y_test, y_pred)
classification_report(y_test, y_pred)