def loadSDSSdata(folder='/Users/sammy/Google Drive/MachineLearning/AstroSDSS/', filename="qso10000.csv",
                 plot=False):
    """
    Load SDSS QSO data. The redshift range is rather broad, from about 0.3 to 6.
    """
    filename = folder + filename
    qsos = pd.read_csv(filename,index_col=0, usecols=["objid","dered_r","spec_z","u_g_color",
                                                      "g_r_color","r_i_color","i_z_color","diff_u",
                                                      "diff_g1","diff_i","diff_z"])

    qsos = qsos[(qsos["dered_r"] > -9999) & (qsos["g_r_color"] > -10) & (qsos["g_r_color"] < 10)]
    qso_features = copy.copy(qsos)
    qso_redshifts = qsos["spec_z"]
    del qso_features["spec_z"]

    if plot:
        ## truncate the color at z=2.5 just to keep some contrast.
        norm = mpl.colors.Normalize(vmin=min(qso_redshifts.values), vmax=2.5)
        cmap = cm.jet_r
        m = cm.ScalarMappable(norm=norm, cmap=cmap)
        pd.scatter_matrix(qso_features[0:2000], alpha=0.2, figsize=[15, 15],
                          color=m.to_rgba(qso_redshifts.values))
        plt.savefig('Sample.pdf')
        plt.close()

    X_train, X_test, y_train, y_test = train_test_split(qso_features.values, qso_redshifts.values,
                                                        random_state=42)

    print "feature vector shape=", qso_features.values.shape
    print 'Training sample shape=', X_train.shape
    print 'Testing sample shape=', X_test.shape

    return X_train, X_test, y_train, y_test
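
# Hedged usage sketch (added, not part of the original source): the split
# above drops spec_z from the features, so a quick photometric-redshift
# baseline could look like this; KNeighborsRegressor is an illustrative choice.
from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = loadSDSSdata()
knn = KNeighborsRegressor(n_neighbors=10).fit(X_train, y_train)
print("test-sample R^2:", knn.score(X_test, y_test))
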
def feature_m(df_all):
    df_X = df_all[['upgraded_HD', 
                   'upgraded_cpu', 
                   'upgraded_memory', 
                   'apple_care',
                   'year',
                   'px', 
                   'cpu_speed',
                   'image_url_ct',
                   'memory',
                   'HD_size']].copy()  # copy to avoid SettingWithCopyWarning on the assignments below

    df_X['apple_care'] = binarize_boolean_series(df_X['apple_care'])
    df_X['upgraded_HD'] = binarize_boolean_series(df_X['upgraded_HD'])
    df_X['upgraded_memory'] = binarize_boolean_series(df_X['upgraded_memory'])
    df_X['upgraded_cpu'] = binarize_boolean_series(df_X['upgraded_cpu'])        
    df_X['year'] = df_X['year'].astype(int)
    df_X['px'] = df_X['px'].astype(int)
    df_X['cpu_speed'] = df_X['cpu_speed'].astype(float)
    df_X['HD_size'] = df_X['HD_size'].astype(float)
    df_X['memory'] = df_X['memory'].astype(int)

    pd.scatter_matrix(df_X, figsize=(15,15));
    y = df_X.pop('year').ravel()
    X = np.array(df_X)
    return X, y
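
# Hedged usage sketch (added): feature_m pops 'year' as the target, so the
# returned pair feeds straight into scikit-learn; the model choice and the
# df_all variable here are illustrative assumptions.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = feature_m(df_all)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X_tr, y_tr)
print("held-out accuracy:", clf.score(X_te, y_te))
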
    def test_scatter_plot_legacy(self):
        df = pd.DataFrame(randn(100, 2))

        with tm.assert_produces_warning(FutureWarning):
            plotting.scatter_matrix(df)

        with tm.assert_produces_warning(FutureWarning):
            pd.scatter_matrix(df)
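
# The FutureWarning asserted above exists because the top-level pd.scatter_matrix
# was deprecated; a minimal sketch of the non-deprecated spelling, assuming a
# pandas version that exposes the pandas.plotting namespace:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

frame = pd.DataFrame(np.random.randn(100, 2))
scatter_matrix(frame, alpha=0.5, diagonal='hist')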
Example #4
    def _scatterMatrixAct(self):
        df = self.getNumberDataFrame()
        if df is None: return

        DyMatplotlib.newFig()

        pd.scatter_matrix(df)
        plt.gcf().show()
def show_scatter(data, col):

    '''
    shows a scatter matrix of the data
    '''

    if col:
        pd.scatter_matrix(data[col], figsize=(10, 10))
    else:
        pd.scatter_matrix(data, figsize=(10, 10))
Example #6
def slide_13():
    macro = pd.read_csv(MACRODATAPATH)
    data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]
    trans_data = np.log(data).diff().dropna()
    print trans_data[-5:]

    plt.scatter(trans_data['m1'], trans_data['unemp'])
    plt.title('Changes in log %s vs. log %s' % ('m1', 'unemp'))

    pd.scatter_matrix(trans_data, diagonal='kde', color='k', alpha=0.3)
def scatter_matrix_topp(sorted_frame, selected_axes, percentile=1):
    """

    Arguments:
    - `sorted_frame`:
    - `selected_axes`: the axes to include in .the scatterplot matrix
    - `percentile`:
    """
    pd.scatter_matrix(
        # integer division keeps the slice bound an int under Python 3
        np.log(sorted_frame[selected_axes] + 1)[:percentile * len(sorted_frame) // 100]
        )
Example #8
def scoreCorrelations(preds):
    figs=[]
    for p in preds:
        pred=preds[p]
        df=pred.data
        x = df.pivot_table(index='peptide', columns='allele', values=pred.scorekey)
        f=plt.figure()
        ax=f.add_subplot(111)
        pd.scatter_matrix(x, alpha=0.2, figsize=(12,12), diagonal='hist',ax=ax)
        #plt.tight_layout()
        figs.append(f)
    return figs
Example #9
def plot_scatter_matrix(title, tr, fig=None):
    if fig is None:
        fig = plt.figure()  # plt.Figure() would create a detached figure that pyplot never draws on
    t6 = pandas.Series(tr['c'])
    t8 = pandas.Series(tr['gmm'][:,0])
    t9 = pandas.Series(tr['gmm'][:,1])
    t10 = pandas.Series(tr['gmm_p'][:,0])
    t11 = pandas.Series(tr['pbeta'])
    df = pandas.DataFrame({'cat' : t6, 'gmm_0' : t8, 'gmm_1' : t9, 'p' : t10, 'pbeta' : t11})
    pandas.scatter_matrix(df)
    plt.title(title)
    return fig
Example #10
File: Graphs.py, Project: alanhdu/Dex
    def createMatrix(self, event):
        # TODO Fix ugly gridlines. sns.setStyle('nogrid') failed
        dlg = GraphDialog(self.parent, "Matrix Plot Input", ("Select Data",), 
                size=(500, 300), groups=False)

        if dlg.ShowModal() == wx.ID_OK:
            ds = [d[0] for d in dlg.GetName()]
            df = self.parent.data[ds]
            n = len(ds)
            dlg.Destroy()
                
            pd.scatter_matrix(df, grid=False)
            plt.show()
Example #11
    def performScaling(self):
        self.log_data = pd.DataFrame(np.log(self.data), columns=self.data.columns)
        self.log_samples = pd.DataFrame(np.log(self.samples), columns=self.samples.columns)
        fname = "customers_log.csv"
        if not os.path.isfile(fname):
            self.log_data.to_csv(fname)
            scaler = preprocessing.StandardScaler()
            self.data_log_std = pd.DataFrame(scaler.fit_transform(self.log_data), columns=self.log_data.columns)
            self.data_log_std.to_csv("customers_log_std.csv")
        
        pd.scatter_matrix(self.log_data, alpha = 0.3, figsize = (14,8), diagonal = 'kde')
        print(self.log_samples)
#         plt.show()
        return
Example #12
def plot_feature_scatter(df_feat, df_files, write_dst=''):
    '''Plot scatter matrix for all features.
    Save Exercise-labeled version of scatter plot for inspection'''
    
    # visualize features in the test set
    ax = pd.scatter_matrix(df_feat, alpha=0.2, figsize=(15, 15), diagonal='kde');

    # remove axis labels
    for axi in ax:
        for axij in axi:    
            axij.set_yticks([])
            axij.set_xticks([])

    if write_dst:
        # also create and save a version of this plot with points colored by exercise label
        df_labeled = df_feat.join(df_files.Exercise)

        g = sns.PairGrid(df_labeled, hue="Exercise")
        g.map_upper(plt.scatter, alpha=0.2)
        g.map_diag(plt.hist)
        # g.map_lower(sns.kdeplot, alpha=0.2, cmap='Greys_d')  # trouble calculating the kde

        g.add_legend()
        g.savefig(write_dst)
        plt.close() # don't create the plot here

    return ax
Example #13
def make_scatter_plots(features_of_interest, df):
    '''
    This function makes bivariate scatter matrix plot for the
    inputed features of interest, which are typically the 
    individual features of the greatest importance in our 
    supervised learning classification model
    INPUTS: features_of_interest = list of strings; df =  pandas
    data frame containing song feature data
    '''
    plt.figure()
    '''get mask containing songs used in our model'''
    good_mask = np.load('good_mask.npy')
    df = df[good_mask]
    contains_outliers = 'B- Var(c.t.)'
    '''
    remove outliers in the 'B- Var(c.t.)' feature to better see plots
    '''
    df = df[np.abs(df[contains_outliers]\
    - df[contains_outliers].mean()) / df[contains_outliers].std() <= 2.3 ]
    df_trunc = df[features_of_interest]
    color_dict = dict()
    '''label data points by color'''
    color_dict['tec'] = 'b'
    color_dict['hip'] = 'r'
    color_dict['cla'] = 'g'
    color_dict['roc'] = 'k'
    color_dict['pop'] = 'c'
    color_set = np.array([color_dict[name] for name in df['Label']])
    ax = pd.scatter_matrix(df_trunc, color = color_set)
    plt.xlabel('')
    plt.ylabel('')
Example #14
def exploratory_viz(loansData):
    plt.figure()
    p = loansData['FICO.Score'].hist()
    plt.savefig('../figs/fico_score_hist.png')

    a = pd.scatter_matrix(loansData, alpha=0.05, figsize=(14,14))
    plt.savefig('../figs/loan_scatter_matrix.png')
def plot_data(loansData):
    plt.figure()
    p = loansData['FICO.Score'].hist(bins=20)
    plt.show()

    a = pd.scatter_matrix(loansData, alpha=0.05, figsize=(10,10), diagonal='hist')
    plt.show()
Example #16
def scale_features(property_data, samples):

	# Scale the data using the natural logarithm
	# (copy first so the caller's DataFrames are not mutated in place)
	log_data = property_data.copy()
	log_data['Price'] = np.log(property_data['Price'])

	# Scale the sample data using the natural logarithm
	log_samples = samples.copy()
	log_samples['Price'] = np.log(samples['Price'])
	print("\nSamples after scaling:")
	display(log_samples)

	# Produce a scatter matrix for each pair of newly-transformed features
	pd.scatter_matrix(log_data, alpha = 0.3, figsize = (14, 8), diagonal = 'kde')
	plt.show()
	return log_data, log_samples
Example #17
File: explore.py, Project: bchaplin1/hazard
def visualize(data):
    # visualization
    import seaborn as sns
    import matplotlib.pyplot as plt

    # scatter matrix in Seaborn
    sns.pairplot(data)

    # scatter matrix in Pandas
    pd.scatter_matrix(data, figsize=(12, 10))

    # Use a **correlation matrix** to visualize the correlation between all numerical variables.

    # compute correlation matrix
    data.corr()

    # display correlation matrix in Seaborn using a heatmap
    sns.heatmap(data.corr())
Example #18
def openFile(filename):
	
	df_genes = pd.read_csv(filename)
	df_genes2 = df_genes[['A', 'C', 'D', 'B']]
	
	# print df_genes2.head()
	# plt.show()
	gene_scatter = pd.scatter_matrix(df_genes2)
	gene_scatter
	plt.show()
def colored_scatter_matrix(data, colors, title, save=None):
    """ Scatter matrix with parametrized colors (e.g. classes) """
    print('Plot scatter matrix...')
    fig, ax = plt.subplots(figsize=(12.0, 7.5))
    pd.scatter_matrix(
        data,
        diagonal='kde',
        figsize=(10, 10),
        ax=ax,
        c=colors,
        cmap=None
    )
    ax.set_title(title)
    if save:
        fig.savefig(save)
    else:
        mng = plt.get_current_fig_manager()
        mng.window.showMaximized()
        plt.show()
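
# Hedged usage sketch (added) for the helper above; the random data and the
# two-class color mapping are illustrative only.
import numpy as np
import pandas as pd

demo = pd.DataFrame(np.random.randn(100, 3), columns=list('abc'))
demo_labels = np.random.randint(0, 2, size=100)
colored_scatter_matrix(demo, np.where(demo_labels == 0, 'red', 'blue'),
                       'Features colored by class', save='matrix.png')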
Example #20
def get_iris_dataset():
    iris_dataset = load_iris()

    #1. The format of the dataset
    print("Keys of iris_dataset: \n{}".format(iris_dataset.keys()))
    print("Target names: {}".format(iris_dataset['target_names']))
    print("Feature names: \n{}".format(iris_dataset['feature_names']))
    # data -> numpy.ndarray
    # rows -> the samples
    # columns -> the features
    print("Shape of data: {}".format(iris_dataset['data'].shape))     # (150, 4)
    print("Shape of target: {}".format(iris_dataset['target'].shape)) # (150,)

    #import pdb; pdb.set_trace()

    #2. split the dataset into training set and testing set
    # y = f(X)
    X_train, X_test, y_train, y_test = train_test_split(iris_dataset['data'], iris_dataset['target'],test_size=0.2, random_state=0)
    print("X_train shape: {}".format(X_train.shape))
    print("y_train shape: {}".format(y_train.shape))

    print("X_test shape: {}".format(X_test.shape))
    print("y_test shape: {}".format(y_test.shape))


    #import pdb; pdb.set_trace()

    # 3. inspect the data - visualize it
    # convert the NumPy array into a pandas DataFrame
    iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)

    # pdb; pdb.set_trace()
    grr = pd.scatter_matrix(iris_dataframe, c=y_train, figsize=(15,15), marker='o', hist_kwds={'bins':20}, s=60, alpha=.8, cmap=mglearn.cm3)
    plt.show()


    #import pdb; pdb.set_trace()

    # The model
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=1)
    # build the model on the training set
    knn.fit(X_train, y_train)

    # the prediction
    X_new = np.array([[5, 2.9, 1, 0.2]])
    prediction = knn.predict(X_new)
    print("Prediction: {}".format(prediction))
    print("Predicted target name: {}".format(iris_dataset['target_names'][prediction]))

    y_pred = knn.predict(X_test)
    print("Test set predictions:\n {}".format(y_pred))
    print("Test set score: {:.2f}".format(np.mean(y_pred==y_test)))
def data_analysis_and_correlation(df_education, df_gdp):
    """ Analysis and Correlation education data with gdp. """
    print "[Data Analysis and Correlation of Education to GDP data] ==> Begin"
    common_countries = list(set(df_education['Country'].tolist()) & set(df_gdp['Country'].tolist()))
    gdp = []
    total_school_time = []
    men_school_time = []
    women_school_time = []
    for cntry in common_countries:
        df1 = df_education[df_education['Country'] == cntry]
        df2 = df_gdp[df_gdp['Country'] == cntry]
        if df2['GDP_'+ df1['Year'].iloc[0]].iloc[0] != '':
            total_school_time.append(int(df1['Total_School_Time'].iloc[0]))
            men_school_time.append(int(df1['Men_School_Time'].iloc[0]))
            women_school_time.append(int(df1['Women_School_Time'].iloc[0]))
            gdp.append(math.log(df2['GDP_'+ df1['Year'].iloc[0]].iloc[0]))
    df_edu_to_gdp = pd.DataFrame({'Total': total_school_time, 'Men': men_school_time, \
                                  'Women': women_school_time, 'GDP': gdp})    
    print(df_edu_to_gdp.corr(), "\n")
    
    gdp_np_array = np.array(df_edu_to_gdp.GDP.tolist())
    for col in ['Women', 'Men', 'Total']:
        r_val, p_val = sp(gdp_np_array, np.array(df_edu_to_gdp[col].tolist()))
        print "Correlation of GDP against {}:".format(col)
        print "Pearsons correlation coefficient: {}".format(r_val)
        print "2-tailed p-values: {}\n".format(p_val)
        
    # Scatter matrix plot with histogram of data plots in the diagonal
    pd.scatter_matrix(df_edu_to_gdp, alpha=0.05, figsize=(10, 10), diagonal='hist')
    plt.savefig('figures/education_to_gdp/data_education_gdp_analysis.png')
    plt.clf()
#     
#         ==> Conclusion / Summary
#                    GDP       Men     Total     Women
#        GDP    1.000000  0.495794  0.479050  0.497923
#        Men    0.495794  1.000000  0.971663  0.942572
#        Total  0.479050  0.971663  1.000000  0.977217
#        Women  0.497923  0.942572  0.977217  1.000000
#       
    print """
Example #22
def scattermatrix(tables):
    fig = plot.figure(frameon=False,facecolor='white')
    index=common_index(tables)
    data=pd.DataFrame(index=index)
    for i in tables:
        data[i[0]] = i[1].loc[index]['MEDIAN']
    axs=pd.scatter_matrix(data, alpha=0.2, figsize=(8,8), diagonal='none', marker='.',)
    
    for ax in axs[:, 0]:
        ax.grid(False, axis='both')
        ax.set_ylabel(wrap(ax.get_ylabel()), rotation=0, va='center', labelpad=30)
        ax.set_yticks([])
    for ax in axs[-1, :]:
        ax.grid(False, axis='both')
        ax.set_xlabel(wrap(ax.get_xlabel()), rotation=90)
        ax.set_xticks([])
    return fig        
Example #23
    def _doplot(self, data, ax, kind, subplots, kwargs):
        """Do core plotting"""

        cols = data.columns
        rows = int(round(np.sqrt(len(data.columns)),0))
        if len(data.columns) == 1:
            kwargs['subplots'] = 0
        if kind == 'pie':
            kwargs['subplots'] = True
        if subplots == 0:
            layout = None
        else:
            layout=(rows,-1)
        if kind == 'bar':
            if len(data) > 50:
                ax.get_xaxis().set_visible(False)
            if len(data) > 400:
                print ('too many bars to plot')
                return
        if kind == 'scatter':
            axs = self.scatter(data, ax, **kwargs)
            if kwargs['sharey'] == 1:
                lims = self.fig.axes[0].get_ylim()
                for a in self.fig.axes:
                    a.set_ylim(lims)
        elif kind == 'boxplot':
            axs = data.boxplot(ax=ax, rot=kwargs['rot'], grid=kwargs['grid'])
            #boxplot won't accept required kwargs?
            if kwargs['logy'] == 1:
                ax.set_yscale('log')
        elif kind == 'histogram':
            bins = int(kwargs['bins'])
            axs = data.plot(kind='hist',layout=layout, ax=ax, **kwargs)
        elif kind == 'heatmap':
            axs = self.heatmap(data, ax, kwargs)
        elif kind == 'bootstrap':
            axs = plotting.bootstrap_plot(data)
        elif kind == 'scatter_matrix':
            axs = pd.scatter_matrix(data, ax=ax, **kwargs)
        elif kind == 'hexbin':
            x = cols[0]
            y = cols[1]
            axs = data.plot(x,y,ax=ax,kind='hexbin',gridsize=20,**kwargs)
        else:
            axs = data.plot(ax=ax, layout=layout, **kwargs)
        return axs
Example #24
def realiseData():
    data = pd.read_csv(csvPath)
    # data.plot(kind='density', subplots=True, layout=(3, 3), sharex=False)
    pd.scatter_matrix(data)
    plt.show()
Example #25
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
"""
DBSCN
    describe:
       核心对象: 某个点的密度达到阈值(minPts)则为核心点
       邻域阈值(r)

       传销算法


"""

colors = np.array(['red', 'green', 'blue', 'yellow'])

# Read the data
beer = pd.read_csv('./data/data.txt', sep=' ')
X = beer[["calories", "sodium", "alcohol", "cost"]]

# dbscan
db = DBSCAN(eps=10, min_samples=2).fit(X)

beer['cluster_db'] = db.labels_
beer.groupby('cluster_db').mean()

pd.scatter_matrix(X, c=colors[beer.cluster_db], figsize=(10, 10), s=100)
plt.show()
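
# Note (added): DBSCAN labels noise points -1, which the fancy indexing above
# silently maps to colors[-1] ('yellow'). A hedged sketch that colors noise
# explicitly instead:
noise = beer['cluster_db'] == -1
point_colors = np.where(noise, 'gray', colors[beer['cluster_db'] % len(colors)])
pd.scatter_matrix(X, c=point_colors, figsize=(10, 10), s=100)
plt.show()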
Example #26
df.groupby('species').agg(np.mean)
df.groupby('species').agg([np.min, np.max])
df.groupby('species').describe()

# explore data by sorting, looking for differences between species
df.sort_values(by='sepal_length').values
df.sort_values(by='sepal_width').values
df.sort_values(by='petal_length').values
df.sort_values(by='petal_width').values

# explore data visually, looking for differences between species
df.petal_width.hist(by=df.species, sharex=True)
df.boxplot(column='petal_width', by='species')
df.boxplot(by='species')
df.plot(x='petal_length', y='petal_width', kind='scatter', c=iris.target)
pd.scatter_matrix(df, c=iris.target)

## PART 2: Write a function to predict the species for each observation

# create a dictionary so we can reference columns by name
col_ix = {col: index for index, col in enumerate(df.columns)}


# define function that takes in a row of data and returns a predicted species
def classify_iris(data):
    if data[col_ix['petal_length']] < 3:
        return 'setosa'
    elif data[col_ix['petal_width']] < 1.8:
        return 'versicolor'
    else:
        return 'virginica'
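
# Hedged check (added): apply the rule to every row and measure agreement with
# the labeled species (assumes df keeps a 'species' column using these names).
predictions = [classify_iris(row) for row in df.values]
print('rule-based accuracy:', (pd.Series(predictions, index=df.index) == df['species']).mean())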
scaled_df.describe()




# Correlation matrix
scaled_df.corr()




# Correlation plots
pd.scatter_matrix(scaled_df, figsize=(22,22))
plt.show()




# Correlation heatmap
sns.set(rc={'figure.figsize':(80,10)})

corr = scaled_df.corr()
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
Example #28
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt

wine = pd.read_csv('/Users/Shared/py/winequality-red.csv', sep=';')

clf = linear_model.LinearRegression()

X = wine.drop(['quality'], axis=1)

Y = wine['quality']

clf.fit(X, Y)

print(clf.coef_)
print(clf.intercept_)

print(
    pd.DataFrame({
        "Name": X.columns,
        "Coefficients": clf.coef_
    }).sort_values(by='Coefficients'))
plt.matshow(wine.corr())
pd.scatter_matrix(wine)
plt.scatter(X['alcohol'], Y)  # scatter needs 1-D x; 'alcohol' chosen here as an illustrative column
Example #29
lookup_fruit_name = dict(
    zip(fruits.fruit_label.unique(), fruits.fruit_name.unique()))
lookup_fruit_name

#split the data into training and test sets on the target variable fruit_label; random_state acts like a seed in R
X = fruits[['height', 'width', 'mass', 'color_score']]
y = fruits['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

#visualize the data as a pairwise scatterplot of all independent variables, colored by the target
from matplotlib import cm
cmap = cm.get_cmap('gnuplot')
scatter = pd.scatter_matrix(X_train,
                            c=y_train,
                            marker='o',
                            s=40,
                            hist_kwds={'bins': 15},
                            figsize=(9, 9),
                            cmap=cmap)

#visualize in 3d
# plotting a 3D scatter plot
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_train['width'],
           X_train['height'],
           X_train['color_score'],
           c=y_train,
           marker='o',
Example #30
# equivalent ways to replace spaces in column names with underscores
ufo_cols = [name.replace(' ', '_') for name in ufo.columns]
ufo.columns = ufo_cols
ufo.columns = ufo.columns.str.replace(' ', '_')  # string-method shortcut
# ufo.Location = ufo.City + ', ' + ufo.State
ufo['Location'] = ufo.City + ', ' + ufo.State

users = pd.read_table('u.user', sep='|', index_col='user_id')
users.groupby('occupation').count()
users.occupation.value_counts()
users.groupby('occupation').age.mean()
users.groupby('occupation').age.agg(['min', 'max'])
users.groupby(['occupation', 'gender']).age.mean()
users.groupby(['occupation', 'gender']).age.agg(['mean', 'count'])

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 8)

drinks[['beer', 'wine']].sort_values('beer').values
drinks.plot(kind='scatter', x='beer', y='wine', alpha=.3)
plt.xlabel('Beer')
plt.ylabel('Wine')
pd.scatter_matrix(drinks[['beer', 'spirit', 'wine']], figsize=(10, 8))
plt.style.use('ggplot')
drinks.continent.value_counts().plot(kind='bar')
drinks.groupby('continent').mean().plot(kind='bar', figsize=(10, 8))
drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar')
drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar',
                                                               stacked=True)
Example #31
def scatterplot(data, title=None, color=None):
    pd.scatter_matrix(data, alpha=0.3, diagonal='kde', color=color)
    if title is not None:
        plt.suptitle(title)
    plt.show()
Example #32
# Plot the data (similar to before)
plt.plot(x_prime, y_hat, 'r', linewidth=2, alpha=0.9)
"""
COMMON PROBLEMS - Multicollinearity
"""

# Now let's run a multiple linear regression
# The temp variable is no longer significant. Why? Multicollinearity
est_m = smf.ols(formula='cnt ~ atemp + temp + workingday + windspeed',
                data=bike_dat).fit()
est_m.summary()

# Scatter plot (observe the (unsurprising) correlation between atemp and temp)
cols = ['cnt', 'atemp', 'windspeed', 'weathersit', 'temp', 'workingday', 'hum']
pd.scatter_matrix(bike_dat[cols])

# Correlation coefficient matrix
corr_matrix = np.corrcoef(bike_dat[cols].T)
sm.graphics.plot_corr(corr_matrix, xnames=cols)

# Let's say we wanted to include an interaction term
# We would do this by including the ':' between interacting variables
est_m = smf.ols(formula='cnt ~ temp + windspeed + temp:windspeed + workingday',
                data=bike_dat).fit()

est_m.summary()

# An alternate way of specifying interaction terms
# a*b is equivalent to a + b + a:b
est_m = smf.ols(formula='cnt ~ temp*windspeed + workingday',
Example #33
wiki_data = wiki_data.set_index('Date')
wiki_data.index = wiki_data.index.map(lambda x : parse(x))
wiki_data['changes'] = wiki_data['changes'].astype(int)



death_data = pd.read_csv('CausesOfDeath_France_2001-2008.csv')
death_data['Value'] = death_data['Value'].str.replace(' ','')
death_data['Value'] = death_data['Value'].apply(lambda x : int(re.compile(r'[^0-9]').sub('0',x)))
death_data = death_data[['ICD10','Value','SEX','TIME']]

causes = death_data.groupby('ICD10')['Value'].sum().sort_values(ascending=False)[0:5].index.values

filtered = death_data[death_data['ICD10'].isin(causes)]

filtered_agg = filtered.groupby(['ICD10','TIME']).sum()

filtered_agg.reset_index().pivot('TIME', 'ICD10','Value').plot()
filtered_agg.reset_index().pivot('TIME', 'ICD10','Value').plot(kind="bar")
filtered_agg.reset_index().pivot('TIME', 'ICD10','Value').plot(kind="barh")
filtered_agg.reset_index().pivot('TIME', 'ICD10','Value').plot(kind="barh", stacked=True)

cars = pd.read_csv('cars.csv',sep=';',index_col=0).drop('STRING')
cars['MPG'] = cars['MPG'].astype(float)
cars['Cylinders'] = cars['Cylinders'].astype(float)
cars['Weight'] = cars['Weight'].astype(float)
cars['Acceleration'] = cars['Acceleration'].astype(float)
cars['Horsepower'] = cars['Horsepower'].astype(float)
pd.scatter_matrix(cars, diagonal='kde', color='k', alpha=0.3)

Example #34
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('ml-bank').getOrCreate()
df = spark.read.csv('bank.csv', header=True, inferSchema=True)
df.printSchema()

import pandas as pd

pd.DataFrame(df.take(5), columns=df.columns).transpose()

numeric_features = [t[0] for t in df.dtypes if t[1] == 'int']
print(df.select(numeric_features).describe().toPandas().transpose())

numeric_data = df.select(numeric_features).toPandas()
axs = pd.scatter_matrix(numeric_data, figsize=(8, 8))
n = len(numeric_data.columns)
for i in range(n):
    v = axs[i, 0]
    v.yaxis.label.set_rotation(0)
    v.yaxis.label.set_ha('right')
    v.set_yticks(())
    h = axs[n - 1, i]
    h.xaxis.label.set_rotation(90)
    h.set_xticks(())

df = df.select('age', 'job', 'marital', 'education', 'default', 'balance',
               'housing', 'loan', 'contact', 'duration', 'campaign', 'pdays',
               'previous', 'poutcome', 'deposit')
cols = df.columns
print(df.printSchema())
Example #35
dataset['quality'].unique()  # quality takes values 3-9

dataset.head()

dataset.tail()

#To find the statistical summary
dataset.describe()

#Univariate Analysis
dataset.hist()

#Multivariate Analysis
from pandas.plotting import scatter_matrix  # pandas.tools.plotting was removed in newer pandas

scatter_matrix(dataset)

#Group the dependent variable and independent variables
array=dataset.values
X=array[:,0:11]
Y=array[:,11]

#Splitting the dataset into training set and test set
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.20,random_state=0)


# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
Example #36
iris.petal_width.hist(by=iris.species, sharex=True)
iris.boxplot(column='petal_width', by='species')
iris.boxplot(by='species')

# map species to a numeric value so that plots can be colored by category
iris['species_num'] = iris.species.map({
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2
})
iris.plot(kind='scatter',
          x='petal_length',
          y='petal_width',
          c='species_num',
          colormap='Blues')
pd.scatter_matrix(iris, c=iris.species_num)

## TASK 4

# If petal length is less than 3, predict setosa.
# Else if petal width is less than 1.8, predict versicolor.
# Otherwise predict virginica.

## BONUS


# define function that accepts a row of data and returns a predicted species
def classify_iris(row):
    if row[2] < 3:  # petal_length
        return 0  # setosa
    elif row[3] < 1.8:  # petal_width
Example #37
centers = beer.groupby("cluster3").mean().reset_index()
print(centers)

# Visualize the clustering result (k=3)
from pandas import scatter_matrix
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.size'] = 14
colors = np.array(['red', 'green', 'blue', 'yellow'])
plt.scatter(beer["calories"], beer["alcohol"], c=colors[beer["cluster3"]])

plt.scatter(centers.calories,
            centers.alcohol,
            linewidths=3,
            marker='+',
            s=300,
            c='black')

plt.xlabel("Calories")
plt.ylabel("Alcohol")
plt.show()

scatter_matrix(beer[["calories", "sodium", "alcohol", "cost"]],
               s=100,
               alpha=1,
               c=colors[beer["cluster3"]],
               figsize=(10, 10))
plt.suptitle("With 3 centroids initialized")
plt.show()
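
# Hedged follow-up sketch (added): quantify the k=3 clustering with a
# silhouette score (values near 1 indicate well-separated clusters).
from sklearn.metrics import silhouette_score
features = beer[["calories", "sodium", "alcohol", "cost"]]
print("silhouette (k=3):", silhouette_score(features, beer["cluster3"]))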
Example #38
print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))

# create dataframe from data in X_train
# label the columns using the strings in iris_dataset.feature_names

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
# create a scatter matrix from the dataframe, color by y_train
grr = pd.scatter_matrix(iris_dataframe,
                        c=y_train,
                        figsize=(15, 15),
                        marker='o',
                        hist_kwds={'bins': 20},
                        s=60,
                        alpha=.8,
                        cmap=mglearn.cm3)
#pip install mglearn

#Building Your First Model: k-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
#knn = KNeighborsClassifier(n_neighbors=7)

knn.fit(X_train, y_train)

#Making Predictions
X_new = np.array([[5, 2.9, 1, 0.2]])
print("X_new.shape: {}".format(X_new.shape))
sv = df.groupby(['Survived', 'Pclass', 'Sex'])['Name'].count()
sv.unstack().plot.bar()
plt.savefig('bars_gruppen.png')


# 5. Pair plot
def make_col(x):
    """Color by survival"""
    if x == 0:
        return (1, 0, 0)  # red
    else:
        return (0, 0, 1)  # blue


col = df['Survived'].apply(make_col)
pd.scatter_matrix(df, c=col, figsize=(15, 15))
plt.savefig('paarplot.png')

# 7. Data preparation
del df['Cabin']
del df['Name']

df = df.dropna()

X = df[['Pclass', 'Age']].values
y = df['Survived'].values

# 8. Build the model
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

m = KNeighborsClassifier(n_neighbors=1)
ax1.scatter(hollywood_movies["Profitability"], hollywood_movies["Audience Rating"])
ax1.set_xlabel("Profitability")
ax1.set_ylabel("Audience Rating")
ax1.set_title("Hollywood Movies, 2017-2011")
ax2.scatter(hollywood_movies["Audience Rating"], hollywood_movies["Profitability"])
ax2.set_xlabel("Audience Rating")
ax2.set_ylabel("Profitability")
ax2.set_title("Hollywood Movies, 2017-2011")
plt.show()


## 3. Scatter matrix - profitability and critic ratings ##

normal_movies = hollywood_movies[hollywood_movies["Film"] != "Paranormal Activity"]
filtered_movies = normal_movies[["Profitability","Audience Rating"]]
pd.scatter_matrix(filtered_movies,figsize = (6,6))
plt.show()


## 4. Box plot - audience and critic ratings ##

normal_movies.boxplot(column = ["Critic Rating","Audience Rating"])

## 5. Box plot - critic vs audience ratings per year ##

normal_movies = normal_movies.sort_values("Year")
fig = plt.figure(figsize = (8,4))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
sns.boxplot(data=normal_movies[pd.notnull(normal_movies["Genre"])], x = "Year",y = "Critic Rating", ax = ax1)
sns.boxplot(data = normal_movies[pd.notnull(normal_movies["Genre"])], x = "Year", y = "Audience Rating", ax = ax2)
# [truncated above: FICO-score x-axis tick labels (640-840) passed to set_xticklabels]
q0 = p.set_xlabel('FICO Score')
q1 = p.set_ylabel('Interest Rate %')
q2 = p.set_title('Lending Rate Plot')

#Create a new data frame with selected columns for analysing data
loansmin = loansdata.filter([
    'Interest.Rate', 'FICO.Score', 'Loan.Length', 'Monthly.Income',
    'Amount.Requested'
],
                            axis=1)

a = pd.scatter_matrix(loansmin, alpha=0.05, figsize=(10, 10), diagonal='hist')
# a = pd.scatter_matrix(loansmin,alpha=0.05,figsize=(10, 10), diagonal='kde')
# a = pd.scatter_matrix(loansmin,alpha=0.05,figsize=(8, 8), diagonal='kde')
# a = pd.scatter_matrix(loansmin,alpha=0.05,figsize=(12, 12), diagonal='kde')

interest_rate = loansmin['Interest.Rate']
loan_amount = loansmin['Amount.Requested']
fico_score = loansmin['FICO.Score']

y = np.matrix(interest_rate).transpose()
x1 = np.matrix(fico_score).transpose()
x2 = np.matrix(loan_amount).transpose()

x = np.column_stack([x1, x2])

X = sm.add_constant(x)
Example #42
#
#    plt.xticks(np.arange(len(frame)), values)
#    plt.legend((nonsurv_bar[0], surv_bar[0]),('Did not survive', 'Survived'), framealpha = 0.8)
#
## Common attributes for plot formatting
#plt.xlabel(key)
#plt.ylabel('Number of Passengers')
#plt.title('Passenger Survival Statistics With \'%s\' Feature'%(key))
#plt.show()



# Then look at correlations
# This will also be quite problem-specific since mixture of variables are tricky
# In principle I'd like to see some joint stats
pd.scatter_matrix(data_trn, alpha=0.3, figsize=(5,6), diagonal='kde');
# In case of mixed data this really doesn't give you a good sense of relationships
# I guess you might split into continuous and categorical, but still how about the relationship between continuous and categorical?
# Note: L-shaped pairs of variables: if you sum or take the product you get stuff that is more constant or maybe linear, maybe it tells you something
# You have all kind of 'garbage' continuous with categorical or binary and 
# all combos of those
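
# A sketch acting on the split suggested above (added; assumes data_trn is the
# training DataFrame): scatter-matrix only the numeric columns and summarize
# the categorical ones separately.
numeric_part = data_trn.select_dtypes(include='number')
pd.scatter_matrix(numeric_part, alpha=0.3, diagonal='kde')
for col in data_trn.select_dtypes(exclude='number'):
    print(data_trn[col].value_counts())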

# Maybe you can try to see a pair and the class
clr = ['r', 'b', 'y', 'm', 'c', 'k']
col_i = 'SibSp'
col_j = 'Parch'
# Adding some random noise to distinguish the dots
Z = DataFrame(np.random.rand(nTrn,2), index=data_trn.index)
dxy = 0.45
for j in range(len(set(y_trn))):
    ix = y_trn==j
Example #43
drinks.plot(kind='scatter', x='beer_servings', y='wine_servings', alpha=0.3)

# same scatterplot, except point color varies by 'spirit_servings'
# note: must use 'c=drinks.spirit_servings' prior to pandas 0.15.0
drinks.plot(kind='scatter',
            x='beer_servings',
            y='wine_servings',
            c='spirit_servings',
            colormap='Blues')

# same scatterplot, except all European countries are colored red
colors = np.where(drinks.continent == 'EU', 'r', 'b')
drinks.plot(x='beer_servings', y='wine_servings', kind='scatter', c=colors)

# scatterplot matrix of all numerical columns
pd.scatter_matrix(drinks)
'''
Advanced Filtering (of rows) and Selecting (of columns)
'''

# loc: filter rows by LABEL, and select columns by LABEL
users.loc[1]  # row with label 1
users.loc[1:3]  # rows with labels 1 through 3
users.loc[1:3,
          'age':'occupation']  # rows 1-3, columns 'age' through 'occupation'
users.loc[:,
          'age':'occupation']  # all rows, columns 'age' through 'occupation'
users.loc[[1, 3], ['age',
                   'gender']]  # rows 1 and 3, columns 'age' and 'gender'

# iloc: filter rows by POSITION, and select columns by POSITION
Example #44
# performance in Paris
pres[pres.dep=="PARIS"]

''' VISUALIZATION '''

pres.ump.plot(kind='hist', bins=20)
pres.ps.plot(kind='hist', bins=20)
pres.fn.plot(kind='hist', bins=20)

pres[['ump', 'ps']].sort_values('ump').values
pres.plot(kind='scatter', x='ps', y='ump') # fits hypothesis: higher UMP votes, lower PS votes
pres.plot(kind='scatter', x='ump', y='fn') # line not as evident; but votes may have been interchangeable

# demonstration of vote distribution relationships between binomes
pd.scatter_matrix(pres[['ump', 'ps', 'fn']], figsize=(10, 8))

pres[['ump', 'ps', 'fn']].plot(kind='hist', stacked=True)

# testing hypothesis of voters "so far on the left they come out on the (far) right"
pd.scatter_matrix(pres[['fn', 'ug1', 'ug2']], figsize=(10, 8))
# ^^ it works!

pd.scatter_matrix(pres[['fn', 'ug2', 'ug3']], figsize=(10, 8))


'''
Data source: http://data.gouv.fr

Data desc: 
    print("acc_train = {}, acc_test ={}".format(acc_train, acc_test))
    print("Confusion Matrix:\n{}\n\n {} \n".format(CML, CM))
    print("f1_train = {}, f1_test ={}".format(f1_train, f1_test))
    print("fbeta_train = {}, fbeta_test ={}".format(fb_train, fb_test))
    print("ROC_AUC_train = {}, ROC_AUC_test ={}".format(
        roc_auc_train, roc_auc_test))

############################################# Initial Visual Tests #####################################################
########## ScatterMatrixPlot ##########

if False:
    #Transformed features

    pd.scatter_matrix(biochemistry_data,
                      alpha=0.3,
                      figsize=(16, 8),
                      diagonal='kde')
    plt.show()

if False:

    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    from sklearn.decomposition import PCA

    ndims = 2
    dim_labels = []
    for i in range(1, ndims + 1):
        dim_labels.append("Dimension {}".format(i))
Example #46
fileName = r'../dataSet/Auto.csv'
#if 'coerce', then invalid parsing will be set as NaN
df = pd.read_csv(fileName)
df_numeric = df.apply(pd.to_numeric, args=('coerce',))
mask = ~np.isnan(df_numeric['cylinders'].values) & ~np.isnan(df_numeric['displacement'].values)\
       & ~np.isnan(df_numeric['horsepower'].values) & ~np.isnan(df_numeric['weight'].values)\
       & ~np.isnan(df_numeric['acceleration'].values) & ~np.isnan(df_numeric['year'].values)\
       & ~np.isnan(df_numeric['origin'].values)
X_raw = df_numeric[['cylinders','displacement','horsepower','weight','acceleration','year','origin']][mask]
y = df_numeric['mpg'][mask]

X = sm.add_constant(X_raw)
est = sm.OLS(y,X).fit()
print('Exercise 9 Answer:')
print('(a) see figure 1')
pd.scatter_matrix(df, alpha=0.5)
print('(b) ')
#correlations = np.corrcoef(pd.concat([y, X_raw], axis=1), rowvar=0)
correlations = np.corrcoef(df_numeric.loc[:,'mpg':'origin'][mask], rowvar=0)
print('(c)')
print(est.summary())
print('(c) i. The null hypothesis that all the regression coefficients are zero can be rejected, given the large F-statistic with a very small p-value.')
print('(c) ii. From the p-value of each predictor, every predictor has a statistically significant relationship to the response except cylinders, horsepower and acceleration.')
print('(c) iii. The coefficient of year shows a positive relationship: an increase of 1 year yields a 0.7508 increase in mpg, meaning cars become more fuel efficient over the years.')
print('(d) see figure 2.')
plt.figure(2)
# R's plot for an lm object generates 6 plots: residuals vs fitted values, sqrt(|residuals|) vs fitted values,
# a Normal Q-Q plot, Cook's distances vs row labels, residuals vs leverage, and Cook's distances vs leverage.
# By default the first 3 and the 5th are shown; we plot the defaults in Python.

#residuals vs fitted values
Example #47

# Task 2
#
# Get an overview of the values
# in the columns *Art* (type) and *Status*.
print("\nShip types:")
print(df['Art'].value_counts())
print("\nShip statuses:")
print(df['Status'].value_counts())


# Task 3
#
# Look for possible correlations.
pd.scatter_matrix(df)
plt.savefig('matrix.png')


# Task 4
#
# Plot 'Länge' (length) against 'Höhe' (height) as a scatter plot.
df.plot.scatter('Länge', 'Höhe')
plt.savefig('scatter.png')


# Task 5
#
# One of the entries contains a **data error**.
print("\nEntry with data error:")
print(df.loc['HMS Hood'].transpose())
Example #48
import scipy
import numpy
import matplotlib
import pandas
import sklearn

# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)

# head
print(dataset.head(20))

# descriptions
print(dataset.describe())

dataset.plot(kind='box',
             subplots=True,
             layout=(2, 2),
             sharex=False,
             sharey=False)
matplotlib.pyplot.show()

dataset.hist()
matplotlib.pyplot.show()

# scatter plot matrix
pandas.scatter_matrix(dataset)
matplotlib.pyplot.show()
Example #49
    score = r2_score(y_test, pred)  # r2_score expects (y_true, y_pred)

    scores.append(score)

#calculate mean of all 1000 scores
score = np.mean(scores)
print "\nR^2 score for predicting Milk is: ", score

#OBSERVATION
#A low R^2 value indicates that Milk cannot be predicted very accurately from all the features we have.
#However, since the value is positive, some features must predict it with higher accuracy, so the model does fit the data.
#We should therefore keep this feature for identifying customer habits.

##################################################################################################################
''' VISUALIZATION OF FEATURE DATA'''

#visualize data, with the diagonal showing each feature's distribution
pd.scatter_matrix(data, alpha=0.3, figsize=(14, 8), diagonal='kde')
#plt.show()
'''FEATURE SCALING USING LOG'''

# Scale the data using the natural logarithm
log_data = np.log(data)

# Scale the sample data using the natural logarithm
log_samples = np.log(samples)

# Produce a scatter matrix for each pair of newly-transformed features
pd.scatter_matrix(log_data, alpha=0.3, figsize=(14, 8), diagonal='kde')
plt.show()

print "\nScaled sampled data:\n"
print log_samples
# Scatter plots
macro = pd.read_csv(r'C:\Users\z.chen7\Downloads\Python\pyhton_for_data_science' \
                    '\macrodata.txt')
macro.head()

data = macro[['cpi','m1','tbilrate','unemp']]
data.head()

data.head()
trans_data = np.log(data).diff().dropna()

plt.scatter(trans_data['m1'], trans_data['unemp'])
plt.title('Changes in log %s vs. log %s' % ('m1','unemp'))

pd.scatter_matrix(trans_data, diagonal='kde', color='k', alpha=0.3)


#  Plotting map

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv(r'C:\Users\z.chen7\Downloads\Python\pyhton_for_data_science' \
                   '\ch08_Haiti.csv')
data.info()
data.head()
data.shape
data.columns

data[['INCIDENT DATE', 'LATITUDE','LONGITUDE']][:10]
colors = Bok_GmGFs['VV-VH']

plt.scatter(Bok_GmGFs['gap_fraction'], Bok_GmGFs['VH-VVnorm'], c=colors, alpha=0.3, cmap='viridis')
plt.ylabel("Normalized VH-VV Backscatter (Gamma0 dB)")
plt.xlabel("Canopy Gap Fraction")
plt.colorbar();
plt.savefig("Correlation_VH-VVNormVsGapFraction.tiff", dpi=300)
plt.savefig("Correlation_VH-VVNormVsGapFraction.pdf", dpi=300)
#plt.legend()

Bok_GmGFs2 = Bok_GmGFs.drop('Year', 1)
Bok_GmGFs2 = Bok_GmGFs2.drop(Bok_GmGFs2.columns[[0, 1]], axis=1)

Bok_GmGFs2 = pd.DataFrame(Bok_GmGFs2)
pd.scatter_matrix(Bok_GmGFs2, alpha=0.2, figsize=(10, 10), diagonal='kde')
plt.savefig("Scatter_Gamma0_Bands_GapFraction.tiff", dpi=300)
plt.savefig("Scatter_Gamma0_Bands_GapFraction.pdf", dpi=300)
plt.show()  # show after saving so the written files are not blank


pp = sns.pairplot(data = Bok_GmGFs,
                  y_vars =['gap_fraction'],
                  x_vars = ['VH-VVnorm','VVVHratio','VV-VH'])
plt.savefig("GapFraction_PairPlot_meanGamma0GFstd.tiff", dpi=300)
plt.savefig("GapFraction_PairPlot_meanGamma0GFstd.pdf", dpi=300)

Bok_GmGFs.describe() # ger summary statistics of each variable in Bok_GmGFs

'''
PlotID', 'SARdate', 'VHgamma0', 'VVgamma0', 'VHdb', 'VVdb', 'VV-VH',
Example #52
scatter_matrix(dataset)

# Plotting Graph
plt.scatter(dataset['total_rooms'], dataset['total_bedrooms'])
plt.show()

x = np.arange(-10, 10, 0.01)
y = 0.7 * x + 5
plt.plot(x, y)
plt.show()

y1 = 0.7 * x**2 + x + 8
plt.plot(x, y1)
plt.show()

sig_y = 1 / (1 + np.power(np.e, -x))
plt.plot(x, sig_y)
plt.show()

a = np.random.randn(10)
b = np.random.randn(5, 5)

pd.scatter_matrix(dataset.loc[:, :])
pd.show_versions(as_json=False)

corr_mat = dataset.corr()
sns.heatmap(corr_mat, annot=True)

np.arange(23, 55, 2)
np.linspace(0, 100, 6)
Example #53
# boxplot of beer servings by continent (shows five-number summary and outliers)
drinks.boxplot(column="beer_servings", by="continent")

# scatterplot of beer servings versus wine servings
drinks.plot(kind="scatter", x="beer_servings", y="wine_servings", alpha=0.3)

# same scatterplot, except point color varies by 'spirit_servings'
# note: must use 'c=drinks.spirit_servings' prior to pandas 0.15.0
drinks.plot(kind="scatter", x="beer_servings", y="wine_servings", c="spirit_servings", colormap="Blues")

# same scatterplot, except all European countries are colored red
colors = np.where(drinks.continent == "EU", "r", "b")
drinks.plot(x="beer_servings", y="wine_servings", kind="scatter", c=colors)

# scatterplot matrix of all numerical columns
pd.scatter_matrix(drinks)


"""
Advanced Filtering (of rows) and Selecting (of columns)
"""

# loc: filter rows by LABEL, and select columns by LABEL
users.loc[1]  # row with label 1
users.loc[1:3]  # rows with labels 1 through 3
users.loc[1:3, "age":"occupation"]  # rows 1-3, columns 'age' through 'occupation'
users.loc[:, "age":"occupation"]  # all rows, columns 'age' through 'occupation'
users.loc[[1, 3], ["age", "gender"]]  # rows 1 and 3, columns 'age' and 'gender'

# iloc: filter rows by POSITION, and select columns by POSITION
users.iloc[0]  # row with 0th position (first row)
Example #54
ForwardU = Forward1.loc[Forward1.Status=='UFA',:]
ForwardR = Forward1.loc[Forward1.Status=='RFA',:]

# Correlation

#goalies 
#correlation across category
Gcor = G1617.loc[:, ['Ovrl', 'SV%', 'Supp', 'ReMin', 'HighSV%', 'PP SV%', 'FA', 'SO SV%', 'Cap Hit', 'Ginj']]
Gcor.corr()
plt.matshow(Gcor.corr())
plt.xticks(range(len(Gcor.columns)), Gcor.columns, fontsize=10, color='blue', rotation = 'vertical')
plt.yticks(range(len(Gcor.columns)), Gcor.columns, fontsize=10, color='blue')
plt.colorbar()
plt.show()

pd.scatter_matrix(Gcor, alpha= 0.4, figsize=(7, 7), s=20, marker = '.', edgecolors = 'blue')
plt.show()

#correlation from one select category
Gcor2 = G1617.loc[:, ['GP', 'W', 'L', 'SA', 'SV', 'GA', 'SV%']]
Gcor2.cov()
plt.matshow(Gcor2.corr())
plt.xticks(range(len(Gcor2.columns)), Gcor2.columns, fontsize=10, color='blue', rotation = 'vertical')
plt.yticks(range(len(Gcor2.columns)), Gcor2.columns, fontsize=10, color='blue')
plt.colorbar()
plt.show()

pd.scatter_matrix(Gcor2, alpha= 0.4, figsize=(7, 7), s=20, marker = '.', edgecolors = 'blue')
plt.show()

#players
Example #55

'''
PLOTS
'''
'''
Creates a df with only the numerical columns for a scatter matrix

RESULT: Nearly all of the independent variables follow some sort of power law distribution
'''
Numerical_df = Master_df[['Num_Adv_Event','Num_Serious',
                'Num_Other','Num_Life_Threat','Num_Hosp',
                'Num_Congen_Anom','Num_Disable','Num_Deaths',
                'Num_Male','Num_Female','AE_Per_Year','Adj_Num_AE', 'Adj_Per_Year']]

pd.scatter_matrix(Numerical_df, diagonal='kde')

'''
Correlation matrix

RESULT: Num_Adv_Event is highly correlated (>0.60) with every other column
except Num_Congen_Anom, Num_Disable and Num_Deaths
'''
Corr_matrix = Master_df.corr()
Corr_matrix.to_csv(r'C:\Users\jonbryan90\Desktop\Corr_Matrix')  # raw string: '\U' would be an invalid escape

'''
Density plots by Innovation_Cat for the promising variables (Num_Adv_Event, Num_Congen_Anom, Num_Disable, Num_Deaths)
'''
Master_df.groupby('Innovation_Cat').Num_Adv_Event.plot(kind='kde',
                                                      linewidth=2.5, 
    labels = [
        'id', 'RR', 'C_S', 'U_U_C', 'A_D_R_R', 'a_d_i_r', 'a_d_a_r_r',
        'a_u_d_a_r_r', 'mb_s', 'mb_e', 'mb_sub', 'mb_esec', 'mb_inp',
        'mb_insec', 'mb_uneng', 'mb_idles'
    ]
    ax.set_xticklabels(labels, fontsize=10)
    ax.set_yticklabels(labels, fontsize=6)
    ax.matshow(corr)

    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)


plot_corr(input, 15)

from pandas import scatter_matrix
scatter_matrix(input, diagonal='kde')

san = input.corr()
corr = pd.DataFrame(san)
#plotting categorical variables

san = input.day
san.value_counts().plot(kind='bar')

#looking for unique domains:
len(set(input.from_domain_hash))

#sendex approach

# anova test for weekly data
from statsmodels.formula.api import ols
#  Clean Data:  Remove null value rows
loansData.dropna(inplace=True)

loansData['Interest.Rate'] = loansData['Interest.Rate'].map(lambda x: float(x.rstrip('%')))
loansData['Loan.Length']   = loansData['Loan.Length'].map(lambda x: int(x.rstrip('months')))
loansData['FICO.Score']    = loansData['FICO.Range'].map(lambda x: int(x.split('-')[0]))

#  Create Histogram of FICO scores 
plt.figure()
a = loansData['FICO.Score'].hist()
plt.savefig("Bar_Plot_FICO_Score.png")

#  Create Scatter Matrix of loan data
plt.figure()
a = pd.scatter_matrix(loansData, alpha=0.05, figsize=(10,10), diagonal='hist')
plt.savefig("Scatter_Matrix_Loan_Data.png")

#  Create Scatter Plot of loan data (FICO vs Interest Rate)
plt.figure()
a = loansData.plot.scatter(x = 'FICO.Score', y = 'Interest.Rate')
plt.savefig("Scatter_Plot_Loan_Data.png")

# The dependent variable
y = np.matrix(loansData['Interest.Rate']).transpose()

# The independent variables shaped as columns
x1 = np.matrix(loansData['FICO.Score']).transpose()
x2 = np.matrix(loansData['Amount.Requested']).transpose()
x = np.column_stack([x1,x2])
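
# Hedged continuation sketch (added): fit the linear model the columns above
# were stacked for (assumes statsmodels imported as sm earlier in the script).
import statsmodels.api as sm
X = sm.add_constant(x)
model = sm.OLS(y, X).fit()
print(model.summary())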
Example #58
9. class = Class variable (0 or 1) 
'''

names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
]
dataframe = pd.read_csv(url, names=names)
print(type(dataframe))

# df_head = dataframe.head()
# print df_head

# df_shape = dataframe.shape
# print df_shape

# df_dtypes = dataframe.dtypes
# print df_dtypes

# df_describe = dataframe.describe()
# print df_describe

# df_correlation = dataframe.corr()
# print df_correlation

plt.figure()
# dataframe.plot.hist(by='age')
# dataframe['age'].plot.hist()
# dataframe.plot.box(by='age')
# dataframe.plot(kind='box')
pd.scatter_matrix(dataframe)
plt.show()
Example #59
File: knn.py, Project: btrani/projects
#Calculate average sale price by zip code as proxy for zip code
avg_by_zip = df.groupby(['ZIP CODE'])['SALE PRICE'].median().reset_index()
avg_by_zip.columns = ['ZIP CODE', 'avg_sale_by_zip']
df = pd.merge(df, avg_by_zip, on='ZIP CODE', how='outer')

#Transform sale price using log normal function to normalize data
def log(x):
    return math.log(x)

df['log_sale'] = df['SALE PRICE'].apply(log)
df['log_avg_sale'] = df['avg_sale_by_zip'].apply(log)
df['gsf_log'] = df['GROSS SQUARE FEET'].apply(log)

#Investigate potential relationships via scatter matrix
a = pd.scatter_matrix(df, figsize = (10,10), diagonal='hist')

#Split into train and test data sets
labels = df['log_sale']
df_clean = df[['TOTAL UNITS', 'avg_sale_by_zip', 'GROSS SQUARE FEET']]

X_train, X_test, y_train, y_test = train_test_split(df_clean, labels, \
test_size=0.2, random_state=0)

#Prep independent and dependent variables for regression
y = np.matrix(y_train).transpose()

#Fit the OLS model
X = sm.add_constant(X_train)
model = sm.OLS(y, X)  # use the design matrix with the constant added, not the raw X_train
fitted = model.fit()
Example #60
from sklearn.pipeline import Pipeline  #imputing within a pipeline
from sklearn.svm import SVC  #support vector classification

plt.style.use('ggplot')

iris = datasets.load_iris()
type(iris)
print(iris.keys())
type(iris.data), type(iris.target)
iris.data.shape
iris.target_names
X = iris.data
y = iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
print(df.head())
_ = pd.scatter_matrix(df, c=y, figsize=[8, 8], s=150, marker='D')

knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X, y)
y_pred = knn.predict(X)
new_prediction = knn.predict(X)
print("Prediction: {}".format(new_prediction))
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=21,
                                                    stratify=y)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
confusion_matrix(y_test, y_pred)
classification_report(y_test, y_pred)