Пример #1
0
 def setUp(self):
     """Load the tips CSV and prepare three trellised copies of one layer stack.

     The stack is a data layer, a point layer (total_bill vs. tip) and a
     polynomial-fit layer built with argument 2.  It is trellised by
     sex/smoker, by sex only and by smoker only.
     """
     csv_path = os.path.join(curpath(), 'data/tips.csv')
     self.data = read_csv(csv_path, sep=',')

     # Layers are listed in the order they are sequenced.
     stack = [
         rplot.Layer(self.data),
         rplot.GeomPoint(x='total_bill', y='tip'),
         rplot.GeomPolyFit(2),
     ]
     self.layers = rplot.sequence_layers(stack)

     # '.' marks the grid dimension that is not split.
     self.trellis1 = rplot.TrellisGrid(['sex', 'smoker'])
     self.trellis2 = rplot.TrellisGrid(['sex', '.'])
     self.trellis3 = rplot.TrellisGrid(['.', 'smoker'])

     self.trellised1 = self.trellis1.trellis(self.layers)
     self.trellised2 = self.trellis2.trellis(self.layers)
     self.trellised3 = self.trellis3.trellis(self.layers)
Пример #2
0
def trellis_plot_density():
    """Draw a trellised density plot of tip against total bill.

    A trellis plot arranges data in a rectangular grid by the values of
    certain attributes; here the grid is split by 'sex' and 'smoker' and
    the result is rendered onto the current pyplot figure.
    """
    chart = rplot.RPlot(TIPS_DATA, x='total_bill', y='tip')
    grid = rplot.TrellisGrid(['sex', 'smoker'])
    chart.add(grid)
    density = rplot.GeomDensity()
    chart.add(density)
    current_figure = plt.gcf()
    chart.render(current_figure)
Пример #3
0
def trellis_plot_scatter_and_density2d():
    """Draw a trellised scatter plot with a 2D kernel density superimposed.

    A trellis plot arranges data in a rectangular grid by the values of
    certain attributes; here the grid is split by 'sex' and 'smoker' and
    the result is rendered onto the current pyplot figure.
    """
    chart = rplot.RPlot(TIPS_DATA, x='total_bill', y='tip')
    chart.add(rplot.TrellisGrid(['sex', 'smoker']))
    # The scatter layer is added first, then the density layer.
    chart.add(rplot.GeomScatter())
    chart.add(rplot.GeomDensity2D())
    target = plt.gcf()
    chart.render(target)
Пример #4
0
def trellis_plot_scatter_and_polyfit():
    """Draw a trellised scatter plot together with a degree-2 polynomial fit.

    A trellis plot arranges data in a rectangular grid by the values of
    certain attributes; here the grid is split by 'sex' and 'smoker' and
    the result is rendered onto the current pyplot figure.
    """
    trellis = rplot.RPlot(TIPS_DATA, x='total_bill', y='tip')

    grid = rplot.TrellisGrid(['sex', 'smoker'])
    trellis.add(grid)

    points = rplot.GeomScatter()
    trellis.add(points)

    fit = rplot.GeomPolyFit(degree=2)
    trellis.add(fit)

    trellis.render(plt.gcf())
Пример #5
0
def _test():
    """Scatter-plot total area per year, one trellis row per country.

    Loads the 'wa_loss_country' table through ``get_pandas_data`` using a
    connection obtained from ``CONN_PARAM`` and renders the plot onto a
    new pyplot figure.
    """
    result_table = 'wa_loss_country'
    conn = CONN_PARAM.getConn()
    wa_loss_country = get_pandas_data(result_table, conn)

    plt.figure()
    # Plot the frame that was just loaded.  The previous code passed the
    # name `a`, which is not defined in this function, and left
    # `wa_loss_country` unused.
    plot = rplot.RPlot(wa_loss_country, x='year', y='total_area_km2')
    plot.add(rplot.TrellisGrid(['country', '.']))
    plot.add(rplot.GeomScatter())
    plot.render(plt.gcf())
Пример #6
0
 def test_rplot3(self):
     """Render tips data trellised by sex, with colour and shape scales.

     Point colour is scaled randomly by 'day' and point shape by 'size'.
     The data, plot and figure are kept on the instance.
     """
     csv_path = os.path.join(curpath(), 'data/tips.csv')
     plt.figure()
     self.data = read_csv(csv_path, sep=',')

     self.plot = rplot.RPlot(self.data, x='tip', y='total_bill')
     self.plot.add(rplot.TrellisGrid(['sex', '.']))

     # Build the two aesthetic scales before handing them to the point layer.
     colour_scale = rplot.ScaleRandomColour('day')
     shape_scale = rplot.ScaleShape('size')
     points = rplot.GeomPoint(colour=colour_scale, shape=shape_scale)
     self.plot.add(points)

     self.fig = plt.gcf()
     self.plot.render(self.fig)
def cond_hists(df, plot_cols, grid_col):
    """Draw one row of histograms per column, conditioned on ``grid_col``.

    For every name in ``plot_cols`` a new 14x4 figure is created and a
    trellised histogram of that column (split by ``grid_col``) is rendered.

    Parameters
    ----------
    df : data frame handed to ``rplot.RPlot``.
    plot_cols : iterable of column names to plot.
    grid_col : column name used for the trellis grid.

    Returns
    -------
    ``grid_col``, unchanged.
    """
    import matplotlib.pyplot as plt
    import pandas.tools.rplot as rplot

    for column in plot_cols:
        # Fresh, cleared figure with a single axes to carry the title.
        figure = plt.figure(figsize=(14, 4))
        figure.clf()
        axes = figure.gca()

        # Histogram layer on a grid split by `grid_col`.
        trellis = rplot.RPlot(df, x=column, y='.')
        grid = rplot.TrellisGrid(['.', grid_col])
        trellis.add(grid)
        trellis.add(rplot.GeomHistogram())

        heading = 'Histograms of ' + column + ' conditioned by ' + grid_col + '\n'
        axes.set_title(heading)
        # No figure is passed to render(), as in the original call.
        trellis.render()

    return grid_col
Пример #8
0
def main():
    """Plot histograms of the primary focus area, split by 'is_exciting'.

    Reads ``projects.csv`` and ``outcomes.csv`` from ``../../dataset``,
    joins them on 'projectid', encodes 'primary_focus_area' as integer
    codes, prints the resulting frame and shows a trellised histogram.
    """
    # Open the inputs in `with` blocks so both handles are closed as soon
    # as they have been parsed, instead of being left open.
    with open('../../dataset/projects.csv', 'r') as projects_file:
        projects_all = pd.read_csv(projects_file)
    with open('../../dataset/outcomes.csv', 'r') as outcomes_file:
        outcomes = pd.read_csv(outcomes_file)

    projects = pd.merge(projects_all, outcomes, on='projectid')
    projects1 = pd.DataFrame(
        projects.fillna(''),
        columns=['projectid', 'primary_focus_area', 'is_exciting'])
    # factorize() returns (codes, uniques); only the integer codes are kept.
    projects1['cat_primary_focus_area'] = pd.factorize(
        projects1.primary_focus_area)[0]
    print(projects1)

    plt.figure()
    plot = rplot.RPlot(
        projects1,
        x='cat_primary_focus_area',
    )
    plot.add(rplot.TrellisGrid(['.', 'is_exciting']))
    plot.add(rplot.GeomHistogram())
    plot.render(plt.gcf())
    pylab.show()
Пример #9
0
def azureml_main(frame1):
    """Plot residual diagnostics for the scored heating-load data.

    Adds a 'Resids' column to ``frame1`` in place ('Heating Load' minus
    'Scored Label Mean'), splits the rows on 'Overall Height' (7 and 3.5)
    and saves scatter, trellis, histogram and QQ plots as PNG files in the
    current working directory.

    Parameters
    ----------
    frame1 : pandas.DataFrame
        Needs the columns 'Heating Load', 'Scored Label Mean',
        'Overall Height', 'Wall Area', 'Roof Area' and 'Glazing Area'.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns 'rmse_Overall',
        'rmse_35Height' and 'rmse_70Height'.
    """
    # Set graphics backend
    import matplotlib
    matplotlib.use('agg')

    import pandas as pd
    import pandas.tools.rplot as rplot
    import matplotlib.pyplot as plt
    import statsmodels.api as sm

    ## Compute the residuals
    frame1['Resids'] = frame1['Heating Load'] - frame1['Scored Label Mean']

    ## Create data frames by Overall Height
    height7 = frame1.ix[frame1['Overall Height'] == 7]
    height35 = frame1.ix[frame1['Overall Height'] == 3.5]

    ## Create a scatter plot of residuals vs Heating Load.
    fig1 = plt.figure(1, figsize=(9, 9))
    ax = fig1.gca()
    height7.plot(kind='scatter', x='Heating Load', y='Resids',
                 c='DarkBlue', alpha=0.3, ax=ax)
    height35.plot(kind='scatter', x='Heating Load', y='Resids',
                  c='Red', alpha=0.3, ax=ax)
    ax.set_title('Heating load vs. model residuals')
    plt.show()
    fig1.savefig('plot1.png')

    ## Scatter plots of the residuals conditioned by
    ## several features.
    col_list = ["Wall Area", "Roof Area", "Glazing Area"]

    # (subset, point colour, height label used in the title and file name)
    height_groups = (
        (height7, 'DarkBlue', '7'),
        (height35, 'Red', '3.5'),
    )

    for col in col_list:
        ## First plot one value of Overall Height, then the other.
        for subset, colour, height_label in height_groups:
            fig = plt.figure(figsize=(10, 5))
            fig.clf()
            ax = fig.gca()
            plot = rplot.RPlot(subset, x='Heating Load', y='Resids')
            plot.add(rplot.GeomScatter(alpha=0.3, colour=colour))
            plot.add(rplot.TrellisGrid(['.', col]))
            ax.set_title(
                'Residuals by Heating Load and height = ' + height_label +
                ' conditioned on ' + col + '\n')
            plot.render(plt.gcf())
            # File-name prefix 'scater_' is kept as is: it is an output path.
            fig.savefig('scater_' + col + height_label + '.png')

    ## Histograms of the residuals
    # The 3.5 subset goes on the left axes and the 7 subset on the right so
    # the data matches the existing labels (they were previously swapped).
    fig4 = plt.figure(figsize=(12, 6))
    fig4.clf()
    ax1 = fig4.add_subplot(1, 2, 1)
    ax2 = fig4.add_subplot(1, 2, 2)
    ax1.hist(height35['Resids'].as_matrix(), bins=40)
    ax1.set_xlabel("Residuals for Overall Height = 3.5")
    ax1.set_ylabel("Density")
    ax1.set_title("Histogram of residuals")
    ax2.hist(height7['Resids'].as_matrix(), bins=40)
    ax2.set_xlabel("Residuals of model")
    ax2.set_ylabel("Density")
    ax2.set_title("Residuals for Overall Height = 7")
    fig4.savefig('plot4.png')

    ## QQ Normal plot of residuals
    # Same pairing fix as for the histograms above.
    fig3 = plt.figure(figsize=(12, 6))
    fig3.clf()
    ax1 = fig3.add_subplot(1, 2, 1)
    ax2 = fig3.add_subplot(1, 2, 2)
    sm.qqplot(height35['Resids'], ax=ax1)
    ax1.set_title('QQ Normal residual plot \n with Overall Height = 3.5')
    sm.qqplot(height7['Resids'], ax=ax2)
    ax2.set_title('QQ Normal residual plot \n with Overall Height = 7')
    fig3.savefig('plot3.png')

    # NOTE(review): `rmse` is not defined inside this function; presumably
    # it is a helper defined elsewhere in the file -- confirm.
    # Each key now receives the subset its name refers to.
    out_frame = pd.DataFrame({
        'rmse_Overall': [rmse(frame1['Resids'])],
        'rmse_35Height': [rmse(height35['Resids'])],
        'rmse_70Height': [rmse(height7['Resids'])],
    })

    return out_frame
Пример #10
0
# Pie chart of the tweet-type counts.
# Note: pies can be drawn directly with pandas as of 0.14.
# NOTE(review): `plt.pyplot.pie` implies `plt` is bound to the top-level
# `matplotlib` package here, while the trellis section below calls
# `plt.figure()` as if `plt` were `matplotlib.pyplot` -- confirm the import.
plt.pyplot.pie(tweetdf.tweet_type.value_counts())

# Stacked bar chart of the tweet_day x tweet_type cross-tabulation.
crosstabs = pd.crosstab(tweetdf.tweet_day, tweetdf.tweet_type)
crosstabs.plot(kind='bar', stacked=True)
# Histogram of a single column using 25 bins.
df['col'].hist(bins=25)

# Scatter-plot matrix (SPLOM); the diagonal can also be 'hist'.
pd.scatter_matrix(trans_data, diagonal = 'kde', color = 'k', alpha=0.3)

# Trellis plot: a histogram layer on a grid split by 'sex' and 'smoker',
# rendered onto the current figure.
tips_data = pd.read_csv('tips.csv')
import pandas.tools.rplot as rplot
plt.figure()
plot = rplot.RPlot(tips_data, x='total_bill', y='tip')
plot.add(rplot.TrellisGrid(['sex', 'smoker']))
plot.add(rplot.GeomHistogram())
plot.render(plt.gcf())

#---------------------------------------------------------------------------
# Stats - from scipy


# Some useful functions.
# The result of this call is discarded.
np.random.randn(4, 3)
# The years 1880 through 2010 inclusive.
years = range(1880,2011)

# NOTE(review): bare call to `shape`, which is not defined in the visible
# code -- presumably a placeholder note; confirm.
shape()
# Check whether the values in `x` are close to 1 (the result is discarded).
np.allclose(x, 1) 
Пример #11
0
def azureml_main(frame1):
    """Save exploratory plots of the building-energy features.

    Produces a pair-wise scatter matrix, trellised scatter plots of each
    feature against 'Heating Load' (grid split by 'Overall Height' and
    'Orientation'), side-by-side histograms for 'Overall Height' 7 and 3.5,
    and box plots grouped by 'Overall Height'.  Every figure is written to
    a PNG file in the working directory while the local ``Azure`` flag is
    set.

    Parameters
    ----------
    frame1 : pandas.DataFrame
        Must contain the feature columns listed in ``col_list`` plus
        'Heating Load', 'Overall Height' and 'Orientation'.

    Returns
    -------
    pandas.DataFrame
        ``frame1``; this function assigns no columns to it.
    """
    ## import libraries
    import matplotlib
    matplotlib.use('agg')  # Set backend

    from pandas.tools.plotting import scatter_matrix
    import pandas.tools.rplot as rplot
    import matplotlib.pyplot as plt
    import numpy as np

    ## Create a pair-wise scatter plot
    ## ref: http://matplotlib.org/users/pyplot_tutorial.html
    Azure = True

    ## If in Azure, frame1 is passed to function
    # NOTE(review): `eeframe` is not defined in this function; this branch
    # is unreachable while Azure is hard-coded to True.
    if not Azure:
        frame1 = eeframe
    # first figure 1,
    fig1 = plt.figure(1, figsize=(10, 10))
    # returns the current axes
    ax = fig1.gca()
    scatter_matrix(frame1, alpha=0.3, diagonal='kde', ax=ax)
    plt.show()
    if Azure:
        fig1.savefig('scatter1.png')

    # Only columns with one of these dtypes get scatter and box plots.
    numeric_dtypes = (np.int64, np.int32, np.float64)

    ## Create conditioned scatter plots.
    col_list = [
        "Relative Compactness", "Surface Area", "Wall Area", "Roof Area",
        'Glazing Area', "Glazing Area Distribution"
    ]

    for col in col_list:
        if frame1[col].dtype not in numeric_dtypes:
            continue

        fig = plt.figure(figsize=(12, 6))
        # clear the current figure with clf() and the current axes with cla()
        fig.clf()
        ax = fig.gca()
        # http://pandas.pydata.org/pandas-docs/version/0.14.1/rplot.html
        # RPlot is a flexible API for producing Trellis plots. These plots
        # allow you to arrange data in a rectangular grid by values of
        # certain attributes.
        plot = rplot.RPlot(frame1, x=col, y='Heating Load')
        plot.add(rplot.TrellisGrid(['Overall Height', 'Orientation']))
        plot.add(rplot.GeomScatter())
        plot.add(rplot.GeomPolyFit(degree=2))
        ax.set_xlabel(col)
        ax.set_ylabel('Heating Load')
        plot.render(plt.gcf())

        if Azure:
            fig.savefig('scatter' + col + '.png')

    ## Histograms of features by Overall Height
    # Same features as above plus the label column.
    col_list = col_list + ["Heating Load"]
    for col in col_list:
        # http://pandas.pydata.org/pandas-docs/stable/indexing.html
        # .ix supports mixed integer and label based access.
        temp7 = frame1.ix[frame1['Overall Height'] == 7, col].as_matrix()
        temp35 = frame1.ix[frame1['Overall Height'] == 3.5, col].as_matrix()
        fig = plt.figure(figsize=(12, 6))
        fig.clf()
        # http://python4mpia.github.io/plotting/advanced.html
        ax7 = fig.add_subplot(1, 2, 1)
        ax35 = fig.add_subplot(1, 2, 2)
        # http://matplotlib.org/api/pyplot_api.html#module-matplotlib.pyplot
        ax7.hist(temp7, bins=20)
        # Titles previously read "for for Overall Height"; the duplicated
        # word has been removed.
        ax7.set_title('Histogram of ' + col + '\n for Overall Height of 7')
        ax35.hist(temp35, bins=20)
        ax35.set_title('Histogram of ' + col +
                       '\n for Overall Height of 3.5')
        if Azure:
            fig.savefig('hists_' + col + '.png')

    ## Create boxplots.
    for col in col_list:
        if frame1[col].dtype not in numeric_dtypes:
            continue
        fig = plt.figure(figsize=(6, 6))
        fig.clf()
        ax = fig.gca()
        frame1[[col, 'Overall Height']].boxplot(column=[col],
                                                ax=ax,
                                                by=['Overall Height'])
        ax.set_xlabel('')
        if Azure:
            fig.savefig('box_' + col + '.png')

    ## In Azure, the function returns the data frame
    return frame1
Пример #12
0
def azureml_main(frame1):
    """Save exploratory plots of the engineered building-energy features.

    Produces a pair-wise scatter matrix, trellised scatter plots of each
    feature against 'Heating Load' (grid split by 'Overall Height' and
    'Orientation'), side-by-side histograms for 'Overall Height' 7 and 3.5,
    and box plots grouped by 'Overall Height'.  Every figure is written to
    a PNG file in the working directory.

    Parameters
    ----------
    frame1 : pandas.DataFrame
        Must contain the feature columns listed in ``col_list`` plus
        'Heating Load', 'Overall Height' and 'Orientation'.

    Returns
    -------
    pandas.DataFrame
        ``frame1``; this function assigns no columns to it.
    """
    ## import libraries
    import matplotlib
    matplotlib.use('agg')  # Set backend

    from pandas.tools.plotting import scatter_matrix
    import pandas.tools.rplot as rplot
    import matplotlib.pyplot as plt
    import numpy as np

    ## Create a pair-wise scatter plot
    fig1 = plt.figure(1, figsize=(10, 10))
    ax = fig1.gca()
    scatter_matrix(frame1, alpha=0.3, diagonal='kde', ax=ax)
    plt.show()
    fig1.savefig('scatter1.png')

    # Only columns with one of these dtypes get scatter and box plots.
    numeric_dtypes = (np.int64, np.int32, np.float64)

    ## Create conditioned scatter plots.
    col_list = [
        "Relative Compactness", "Surface Area", "Wall Area",
        "Relative Compactness Sqred", "Surface Area Sqred", "Wall Area Sqred",
        "Relative Compactness 3", "Surface Area 3", "Wall Area 3", "Roof Area",
        'Glazing Area', "Glazing Area Distribution"
    ]

    for col in col_list:
        if frame1[col].dtype not in numeric_dtypes:
            continue

        fig = plt.figure(figsize=(12, 6))
        fig.clf()
        ax = fig.gca()
        plot = rplot.RPlot(frame1, x=col, y='Heating Load')
        plot.add(rplot.TrellisGrid(['Overall Height', 'Orientation']))
        plot.add(rplot.GeomScatter())
        plot.add(rplot.GeomPolyFit(degree=2))
        ax.set_xlabel(col)
        ax.set_ylabel('Heating Load')
        plot.render(plt.gcf())

        fig.savefig('scatter' + col + '.png')

    ## Histograms of each feature and of Heating Load, by Overall Height
    # Same features as above plus the label column.
    col_list = col_list + ["Heating Load"]
    for col in col_list:
        temp7 = frame1.ix[frame1['Overall Height'] == 7, col].as_matrix()
        temp35 = frame1.ix[frame1['Overall Height'] == 3.5, col].as_matrix()
        fig = plt.figure(figsize=(12, 6))
        fig.clf()
        ax7 = fig.add_subplot(1, 2, 1)
        ax35 = fig.add_subplot(1, 2, 2)
        ax7.hist(temp7, bins=20)
        # Titles previously read "for for Overall Height"; the duplicated
        # word has been removed.
        ax7.set_title('Histogram of ' + col + '\n for Overall Height of 7')
        ax35.hist(temp35, bins=20)
        ax35.set_title('Histogram of ' + col +
                       '\n for Overall Height of 3.5')
        fig.savefig('hists_' + col + '.png')

    ## Create boxplots.
    for col in col_list:
        if frame1[col].dtype not in numeric_dtypes:
            continue
        fig = plt.figure(figsize=(6, 6))
        fig.clf()
        ax = fig.gca()
        frame1[[col, 'Overall Height']].boxplot(column=[col],
                                                ax=ax,
                                                by=['Overall Height'])
        ax.set_xlabel('')
        fig.savefig('box_' + col + '.png')

    ## Return the data frame
    return frame1
Пример #13
0
# Summary statistics for each iris group.
# print is written in its parenthesised form so the snippet parses on both
# Python 2 and Python 3; with a single argument the output is the same.
print(iris_grps.describe())

plt.figure()
pd.scatter_matrix(irisdf)

# example of how sepal is less indicative of target
# Each pair is (y column, x column) for one trellis plot.
patterns = [
    ('sepal_length', 'sepal_width'),
    ('petal_length', 'petal_width'),
]

for pattern in patterns:
    plt.figure(figsize=(18, 6))
    plot = rplot.RPlot(irisdf, x=pattern[1], y=pattern[0])
    # One grid column per value of 'target'.
    plot.add(rplot.TrellisGrid(['.', 'target']))
    plot.add(rplot.GeomScatter())
    print(plot.render(plt.gcf()))

# Derived feature: product of petal length and petal width.
irisdf["petal_area"] = irisdf["petal_length"] * irisdf["petal_width"]


def petal_guess(x):
    """Map ``x`` to class 0, 1 or 2 using upper bounds of 1 and 7.5.

    Returns 0 when ``x < 1``, 1 when ``x < 7.5`` and 2 otherwise.
    """
    # The position of the first bound that `x` falls below is the class.
    for label, upper_bound in enumerate((1, 7.5)):
        if x < upper_bound:
            return label
    return 2

Пример #14
0
# Fragment of a larger plotting script: `Azure`, `fig1`, `frame1`, `np`,
# `plt` and `rplot` are expected to be defined before this point.
# Save the previously built figure only when the Azure flag is set.
if (Azure == True): fig1.savefig("scatter1.png")

# create conditioned scatter plots
col_list = [
    "Relative Compactness", "Surface Area", "Wall Area", "Roof Area",
    "Glazing Area", "Glazing Area Distribution"
]
# Counts the columns that pass the dtype check; it is not read in this fragment.
indx = 0
for col in col_list:
    # Only columns with one of the listed numeric dtypes are plotted.
    if (frame1[col].dtype in [np.int64, np.int32, np.float64]):
        indx += 1
        # Fresh, cleared figure with a single axes for the axis labels.
        fig = plt.figure(figsize=(12, 16))
        fig.clf()
        ax = fig.gca()
        # Scatter plus degree-2 polynomial fit of the column against
        # "Heating Load", on a grid split by "Overall Height" and
        # "Orientation".
        plot = rplot.RPlot(frame1, x=col, y="Heating Load")
        plot.add(rplot.TrellisGrid(["Overall Height", "Orientation"]))
        plot.add(rplot.GeomScatter())
        plot.add(rplot.GeomPolyFit(degree=2))
        ax.set_xlabel(col)
        ax.set_ylabel("Heating Load")
        plot.render(plt.gcf())
        if (Azure == True): fig.savefig("scatter" + col + ".png")

# create histograms
# The label column is appended in place, so col_list now has seven entries.
col_list.append("Heating Load")

for col in col_list:
    # Values of the column for each of the two "Overall Height" groups.
    temp7 = frame1.ix[frame1["Overall Height"] == 7, col].as_matrix()
    temp35 = frame1.ix[frame1["Overall Height"] == 3.5, col].as_matrix()
    fig = plt.figure(figsize=(12, 6))
    fig.clf()
Пример #15
0
# 2D plot of linoleic vs. arachidic, labelled through `rmap`.
a = make2d(df, "linoleic", "arachidic", labeler=rmap)
a.legend(loc='upper right')

# **A nonlinear classifier could separate the north from Sardinia!**

# We use the really ugly trellis rplot interface in Pandas to do some hierarchical digging. We plot oleic against linoleic. **We can split Sardinia. We might be able to split East Liguria out but there could be significant misclassification.**

# In[49]:

import pandas.tools.rplot as rplot
# Work on a copy so the region relabelling does not touch `df`.
dfcopy = df.copy()
dfcopy['region'] = dfcopy['region'].map(rmap)
# Pairs the unique `area` values with the unique `areastring` values; it is
# only used by the commented-out line below.
imap = {e[0]: e[1] for e in zip(df.area.unique(), df.areastring.unique())}
#dfcopy['area']=dfcopy['area'].map(imap)
plot = rplot.RPlot(dfcopy, x='linoleic', y='oleic')
# One grid row per region.
plot.add(rplot.TrellisGrid(['region', '.']))
plot.add(
    rplot.GeomPoint(size=40.0,
                    alpha=0.3,
                    colour=rplot.ScaleRandomColour('area')))

fig = plot.render()
# print is written in its parenthesised form so the snippet parses on both
# Python 2 and Python 3; with a single argument the output is the same.
print(df.areastring.unique())

# ### YOUR TURN NOW (10 minutes)

# Plot palmitoleic against palmitic. **What can you separate?** Use the `dfcopy` dataframe.

# In[52]:

#your code here