def show_types(limit):
    """
    Shows all the data labels and types in PyDataset

    For each of the first ``limit`` dataset ids, prints the dataset's shape
    and the type of the first value found in every column, followed by a
    blank line.

    :param limit:int, specifies how many of data() to show
    :return: void
    """
    pydataset_labels = data()['dataset_id']
    for label in pydataset_labels[:limit]:
        all_data = data(label)
        column_types = []
        for column in all_data.columns.values:
            # Only the first value is needed, so stop after one element
            # instead of copying the whole column into a list.
            for value in all_data[column]:
                column_types.append(str(type(value)))
                break
            else:
                # No rows at all: indexing element 0 used to raise IndexError.
                column_types.append('empty')
        # A parenthesised single argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(label + ' has shape ' + str(all_data.shape[0]) + ',' +
              str(all_data.shape[1]) + ' types are ' + ','.join(column_types))
        print("")
def train_iris(classifier, prediction_label):
    """
    Train one model per distinct iris label through PyDatasetSchool.basic_train.

    The first four columns of the iris dataset are used as features and the
    fifth as the label. For every distinct label a boolean target marks the
    rows carrying that label, and the model name is 'PyDataset_iris_'
    followed by the label.

    :param classifier: passed through unchanged to basic_train
    :param prediction_label: passed through unchanged to basic_train
    """
    classifier_base_name = 'PyDataset_iris_'
    iris_values = np.array(data('iris'))
    X = iris_values[:, :4]
    labels = iris_values[:, 4]
    for species in set(labels):
        # True where the row belongs to the current label, False elsewhere.
        y = np.array([row_label == species for row_label in labels])
        model_name = classifier_base_name + species
        PyDatasetSchool.basic_train((X, y), model_name, classifier, prediction_label)
# |||||||||||||||||||||||||||||||||||||||||||||||||||||||
# |||||||||||||||||||||||||||||||||||||||||||||||||||||||

# For several of the following exercises, you'll need to load several datasets
# using the pydataset library. (If you get an error when trying to run the import
# below, use pip to install the pydataset package.)

from pydataset import data

# When the instructions say to load a dataset, you can pass the name of the dataset
# as a string to the data function to load the dataset.
# You can also view the documentation for the data set by passing
# the show_doc keyword argument.

# data('mpg', show_doc=True) # view the documentation for the dataset
mpg = data('mpg') # load the dataset and store it in a variable

# All the datasets loaded from the pydataset library will be pandas dataframes.

# 1 - Copy the code from the lesson to create a dataframe full of student grades.
#     a - Create a column named passing_english that indicates whether
#         each student has a passing grade in reading.
#     b - Sort the english grades by the passing_english column. How are duplicates handled?
#     c - Sort the english grades first by passing_english and then by student name.
#         All the students that are failing english should be first,
#         and within the students that are failing english they should be
#         ordered alphabetically. The same should be true for the students
#         passing english. (Hint: you can pass a list to the .sort_values method)
#     d - Sort the english grades first by passing_english, and then by the actual
#         english grade, similar to how we did in the last step.
# Scatter of x vs y with both colour and marker style keyed on the 'dataset' column.
sns.relplot(x='x', y='y', style='dataset', hue='dataset', data=set_0)
sns.pairplot(set_0)

# !pip install pydataset
# had problems - fixed - there were two Python installs

# INSECT SPRAY SET
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data

insect_spray = data('InsectSprays')
insect_spray.head()

# -- READING SHOWING DOCUMENTATION
data("InsectSprays", show_doc=True)

# Summary statistics per spray group.
insect_spray.groupby('spray').describe()

# -- BOX PLOTS ---
plt.figure(figsize=(12, 10)) # this makes it larger than default
sns.boxplot(data=insect_spray, y='count', x='spray')

# -- Load the swiss dataset and read its documentation.
# Create visualizations to answer the following questions:
#Topic: Data Sets in Python #----------------------------- #https://github.com/iamaziz/PyDataset #libraries import numpy as np import pandas as pd import pydataset from pydataset import data data('iris') data('iris', show_doc=True)#help #better way from pydataset import data data() alldatasets = data().copy() type(alldatasets) from pydataset import data data('iris') data('marketing') data('titanic') alldatasets.head() #check for availability data('iris') data('mtcars') data('ais')
#Topic ----K Means Clustering #https://www.analyticsindiamag.com/beginners-guide-to-k-means-clustering/ import numpy as np import pandas as pd import matplotlib.pyplot as plt from pydataset import data iris = data('iris') data = iris data.head() data.columns data.dtypes data.shape #%%%K-Means Algorithm #Selecting an appropriate value for K which is the number of clusters or centroids #Selecting random centroids for each cluster #Assigning each data point to its closest centroid #Adjusting the centroid for the newly formed cluster in step 4 #Repeating step 4 and 5 till all the data points are perfectly organised within a cluster space #%%% #Dropping the 'Species' column iris_clustering = iris.drop(columns=['Species']) iris_clustering #Selecting 2 random features from the dataset for clustering #Here we choose Sepal Length @ column 0 and Petal Length @ column 2 X = iris_clustering.iloc[:, [0, 2]].values X #We only chose 2 features as we are going to plot in 2D space. The algorithm will work for any number of features. #%%Initialising K-Means With Optimum Number Of Clusters #Fitting K-Means to the dataset
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression
import matplotlib.pyplot as plt
from pydataset import data
import warnings
warnings.filterwarnings('ignore')

tips = data('tips')
bill = tips['total_bill']
tip = tips['tip']

# Ordinary least squares fit of tip on total_bill; predictions are stored in 'yhat'.
regr = ols('tip ~ total_bill', data=tips).fit()
tips['yhat'] = regr.predict(tips['total_bill'])


def plot_residuals(x, y, dataframe):
    """
    Draw and show a seaborn residual plot of y regressed on x.

    x and y are forwarded to seaborn together with `dataframe`, so they may
    be column names of `dataframe` or data vectors.
    """
    # x and y are passed by keyword: recent seaborn releases no longer
    # accept them positionally, and older releases accept both forms.
    sns.residplot(x=x, y=y, data=dataframe)
    plt.show()


def regression_errors(y, yhat):
    # Sum of squared errors between actual and predicted values.
    sse = ((y - yhat)**2).sum()
    # Explained sum of squares: predictions measured against the mean of y.
    ess = ((yhat - y.mean())**2).sum()
    # Total sum of squares, taken as the sum of the two parts above.
    tss = sse + ess
    mse = mean_squared_error(y, yhat)
def plotTinatic():
    """
    Show a bar chart of 'class' value counts within each 'survived' group
    of the pydataset 'titanic' dataset.
    """
    #pydataset.data('titanic', show_doc=True)
    passengers = pydataset.data('titanic')
    #passengers['class'].value_counts().plot(kind='bar')
    class_counts = passengers.groupby('survived')['class'].value_counts()
    class_counts.plot(kind='bar')
    plt.show()
# Remove the 'sigla' column in place, then display the frame.
del df['sigla']
df


# In[2]:


import pydataset


# In[4]:


pydataset.data()


# In[6]:


type(pydataset.data())


# In[7]:


titanic = pydataset.data('titanic')


# In[10]:
from pydataset import data
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

pima = data("Pima.tr")
print(pima)
pima.plot(kind="scatter", x="skin", y="bmi")

# Split skin (feature) and bmi (target) into train and test parts.
x_train, x_test, y_train, y_test = train_test_split(pima.skin, pima.bmi)
plt.scatter(x_train, y_train, label="Training Data", color='r')
plt.scatter(x_test, y_test, label="Testing Data", color='b')
plt.legend()

lr = LinearRegression()
# reshape(-1, 1) turns the 1-D feature values into a single-column 2-D array for fit/predict.
lr.fit(x_train.values.reshape(-1, 1), y_train)
y_predicted = lr.predict(x_test.values.reshape(-1, 1))

# Fitted line in red over the test points in blue.
plt.plot(x_test, y_predicted, color='r')
plt.scatter(x_test, y_test, color='b')
plt.show()

# Predict bmi for a single skin value of 50.
a = np.array([50])
print(a.ndim)
print(lr.predict(a.reshape(-1, 1)))
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Sun Jun 30 12:32:24 2019 @author: yash_j1301 """ #Multi-indexing import numpy as np import matplotlib.pyplot as plt from pydataset import data mtcars=data('mtcars') mtcars.head(10) dir(data) iris=data('iris') titanic=data('titanic') titanic.head(100) data1=mtcars.copy() data1.columns #column names data1.values #matrix of numerical values data1.index #unique rows data1.am.dtypes #data types data1[['am','cyl','mpg','hp','drat','disp','wt','qsec','vs','gear','carb']].astype('category') #converting list to category data1.dtypes #data types of the dataframe data2=data1.reset_index() #resetting the previously set indexes of the dataframe data2.index data2.iloc[0:3,0:4] #index column created
def load_data(self,dataset_name):
    """
    Load a dataset through ``data`` and normalise its column names.

    Dots in column names are replaced with underscores and the names are
    lower-cased.

    :param dataset_name: dataset identifier; converted with ``str()`` before loading
    :return: the loaded dataframe with the renamed columns
    """
    df = data(str(dataset_name))
    # regex=False makes the dot a literal character. As a regular expression
    # '.' matches any character, and the default for `regex` differs between
    # pandas versions, so being explicit keeps the result stable.
    df.columns = df.columns.str.replace('.', '_', regex=False).str.lower()
    return df
# plt.savefig("bar2.pdf") # plt.show() plt.clf(); plt.close() crime[["Robbery", "Aggravated-assault", "Vehicle-Theft"]].plot(kind='box') plt.savefig("box1.pdf") plt.clf(); plt.close() crime[["Robbery", "Aggravated-assault", "Vehicle-Theft"]].plot(kind='box', vert=False, rot=30) plt.savefig("box2.pdf") plt.clf(); plt.close() plt.show() # Hair and Eye color scatter plot hec = data("HairEyeColor") X = np.unique(hec["Hair"], return_inverse=True) Y = np.unique(hec["Eye"], return_inverse=True) hec["Hair"] = X[1] hec["Eye"] = Y[1] hec.plot(kind="scatter", x="Hair", y="Eye", s=hec["Freq"]*20) plt.xticks([0,1,2,3], X[0]) plt.yticks([0,1,2,3], Y[0]) plt.savefig("HairEyeColorscatter.png") ##########Plots for Data Visualization Section########### data('Icecream').plot(kind='scatter', x='temp', y='cons') plt.savefig('Nolabels.png') msleep = data('msleep') msleep.plot(y='sleep_total', title='Mammalian Sleep Data', legend=False)
def from_uri(cls, uri: str, **kwargs) -> "PyDataSet":
    """
    Build a PyDataSet from a URI whose dataset name follows "://".

    The text between the first and second "://" separators (or up to the end
    of the string) is passed to ``pydataset.data``. Extra keyword arguments
    are accepted but not used.

    :raises IndexError: if ``uri`` contains no "://" separator
    """
    uri_parts = uri.split("://")
    frame = pydataset.data(uri_parts[1])
    return PyDataSet(inner_data=frame, uri=uri)
# Plain dictionary of equal-length lists, indexed by key.
dict1 = {'rollno':[1,2,3], 'name':['India','Pakistan','England'], 'captain': ['C1','C2','C3']}
print(dict1)
dict1['rollno']
#%%
import numpy as np
np.arange(1,10)

import pandas as pd
link1 = 'https://raw.githubusercontent.com/DUanalytics/datasets/master/csv/buyPC.csv'
df1 = pd.read_csv(link1)
df1.head()

from pydataset import data
data('mtcars')
mtcars1 = data('mtcars')
mtcars1.describe()
mtcars1.columns
#summary
# Mean mpg and group size for every (cyl, gear) combination.
mtcars1.groupby(['cyl','gear']).agg({'mpg':'mean', 'gear':'size'})
# Frequency table of cyl against gear.
pd.crosstab(mtcars1.cyl, mtcars1.gear)

import matplotlib.pyplot as plt
import seaborn as sns
sns.heatmap(mtcars1.corr(), annot=True)
mtcars1.gear.value_counts().plot(kind='bar')
mtcars1.plot.scatter(x='wt', y='mpg')
# Grouped bars: one group per gear value, one bar per cyl value.
mtcars1.groupby(['gear','cyl']).size().unstack().plot.bar()
def load_from_pydataset(self, dataset: str):
    """Load the pydataset dataset named ``dataset`` and store it on ``self.data_frame``."""
    loaded = pydataset.data(dataset)
    self.data_frame = loaded
#Topic:Pandas DF #----------------------------- #libraries import numpy as np #import NumPy library import pandas as pd #pandas DF are combination of panda Series.. #one column data is a Series of one datatype, DF can have multiple data types #pip install pydataset #install pydataset from pydataset import data #importing dataset mtcars = data('mtcars') #copying mtcars data to mtcars object mtcarsDF = mtcars mtcarsDF #%%describing mtcarsDF.shape #get number of rows and columns mtcarsDF.head(3) #get top 3 rows mtcarsDF.tail(4) #get bottom 4 rows mtcarsDF.describe() mtcarsDF.columns #column names mtcarsDF.dtypes #columns with their datatypes mtcarsDF.index #here index by rownames type(mtcarsDF) mtcarsDF.select_dtypes(include=['int64']) mtcarsDF.select_dtypes(exclude=['int64']) mtcarsDF.isna() mtcarsDF.notna() id(mtcarsDF) mtcars.empty mtcars.size
query = 'SELECT * FROM employees LIMIT 5 OFFSET 50' # In[5]: pd.read_sql(query, url) # In[6]: pd.read_sql('SHOW TABLES', url) # ## 1. Load the mpg dataset. Read the documentation for it, and use the data to answer these questions: # In[7]: from pydataset import data mpg = data('mpg') data('mpg', show_doc=True) # - On average, which manufacturer has the best miles per gallon? # In[8]: mpg.groupby('manufacturer').hwy.agg(['median']).nlargest(n=1, columns="median") # - How many different manufacturers are there? # In[9]: mpg.manufacturer.nunique() # - How many different models are there?
# # Do automatic or manual cars have better miles per gallon? # def simple_trans(trans): # if trans[:4] == 'auto': # return 'auto' # else: # return 'manual' # new_trans = mpg_df.trans.apply(simple_trans) # print(mpg_df.assign(trans2 = mpg_df.trans.apply(simple_trans)).groupby('trans2')['avg_mpg'].mean().idxmax()) # Load the Mammals dataset. Read the documentation for it, # and use the data to answer these questions: mammals_df = data('Mammals') data('Mammals', show_doc=True) # print(mammals_df) # # How many rows and columns are there? # mammals_df.info() # # 107 rows and 4 columns # # What are the data types? # # float and boolean # # What is the the weight of the fastest animal? # print(mammals_df.loc[mammals_df['speed'] == mammals_df['speed'].max()]) # print('-------------') # # weight is 55 kg
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
# DecisionTreeClassifier is used below but was never imported.
from sklearn.tree import DecisionTreeClassifier
from pydataset import data
import pandas as pd #pandas used for data manipulation
#scaler = StandardScaler()
import time #clock
import os
import psutil

# time.clock() was removed in Python 3.8. Keep using it where it exists so
# the timing is unchanged on older interpreters, and fall back to
# perf_counter on newer ones.
_clock = getattr(time, 'clock', None) or time.perf_counter
time_start = _clock()

#TITANIC DATASET
#data('titanic', show_doc=True)
#can predict if they survived can do males and females separetely
titanictemp = data('titanic')

# pd.factorize returns (integer codes, unique values); only the codes are kept.
y = pd.factorize(titanictemp['survived'])[0]
clas = pd.factorize(titanictemp['class'])[0]
age = pd.factorize(titanictemp['age'])[0]
sex = pd.factorize(titanictemp['sex'])[0]

titanic = pd.DataFrame()
titanic['age'] = age
titanic['class'] = clas
titanic['sex'] = sex
# NOTE(review): factorize numbers labels in order of first appearance, so
# the mapping below depends on the first row of the data - confirm.
titanic['survived'] = y # 0 = survived, 1 = died

dt = DecisionTreeClassifier()
# Features are the first three columns (age, class, sex); the target is 'survived'.
X = titanic.iloc[:, 0:3]
y = titanic['survived']
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pydataset import data

iris = data("iris")
# show_doc=True only prints the documentation and returns None, so the
# result must not be assigned back to `iris` (that would discard the data).
data("iris", show_doc=True) #viewing iris docs
iris.columns #viewing column_names

# What does the distribution of petal lengths look like?
sns.distplot(iris["Petal.Length"])

# Is there a correlation between petal length and petal width?
# NOTE(review): `iris` still contains the text column "Species"; recent
# pandas versions may refuse to correlate non-numeric columns - confirm
# against the installed version.
iris_corr = iris.corr()
#check contingency tables
petal_lengthwidth = pd.crosstab(iris_corr["Petal.Length"], iris_corr["Petal.Width"])
sns.heatmap(petal_lengthwidth, annot=False)

# Would it be reasonable to predict species based on sepal width and sepal length?
# Which features would be best used to predict species?
sns.relplot(x="Sepal.Length", y="Sepal.Width", data=iris, hue="Species")
# Normalised frequency table of sepal length against sepal width.
sepals_corr = pd.crosstab(iris["Sepal.Length"], iris["Sepal.Width"], normalize=True)
sepals_corr
sns.heatmap(sepals_corr)
#Method 3 - quantile method Q1 = pd.Series(marks1).quantile(0.25) Q3 = pd.Series(marks1).quantile(0.75) Q1, Q3 IQR = Q3 - Q1 IQR IQR2 = stats.iqr(pd.Series(marks1)) IQR2 #outlier : < Q1 - 1.5*IQR or > Q3 + 1.5*IQR (marks1 < Q1 - 1.5 * IQR) | (marks1 > Q3 + 1.5 * IQR) #dataframe from pydataset import data data( 'Boston' ) #https://www.engineeringbigdata.com/boston-dataset-scikit-learn-machine-learning-in-python/#:~:text=The%20sklearn%20Boston%20dataset%20is,and%20descriptions%20of%20each%20column. boston = data('Boston') boston.head() boston.columns #box plot sns.boxplot(x=boston['dis']) # plot shows three points between 10 to 12, these are outliers as there are not included in the box of other observation i.e no where near the quartiles. #scatterplot boston.columns fig, ax = plt.subplots(figsize=(16, 8)) ax.scatter(boston['indus'], boston['tax']) ax.set_xlabel('Proportion of non-retail business acres per town') ax.set_ylabel('Full-value property-tax rate per $10,000')
# -*- coding: utf-8 -*-
"""
Created on Tue May 14 09:14:31 2019

@author: daver
"""

from pydataset import data

# show_doc=True only prints the documentation and returns None, so the
# documentation is shown first and the dataframe is loaded in a second call.
data('faithful', show_doc=True)
faithful = data('faithful')

from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
import matplotlib.pyplot as plt
from math import sqrt

corr = pearsonr(faithful.waiting, faithful.eruptions)

reg = linear_model.LinearRegression()
reg.fit(faithful[['waiting']], faithful.eruptions) #need a dataframe for x,not series
# Despite its name, y_actual holds the model's predictions for each waiting value.
y_actual = reg.predict(faithful[['waiting']])
# Root mean squared error between observed and predicted eruptions.
rms = sqrt(mean_squared_error(faithful.eruptions, y_actual))

plt.scatter(faithful.waiting, faithful.eruptions, c="blue")
plt.plot(faithful.waiting, y_actual, c="red")
plt.title("Predicted time between eruptions")
plt.ylabel("#of eruptions")
plt.yticks(range(1, 7))
plt.text(50, 5.5, 'RMSE:{:.2}'.format(rms))
plt.show()
users.merge(drop_roles, how='left', indicator=True) ''' id name _merge 0 1 bob left_only 1 2 joe left_only 2 3 sally left_only 3 4 adam left_only 4 5 jane left_only 5 6 mike left_only ''' # Trying to join when foriegn keys are deleted removes the ability for # pandas to merge the dataframes. Instead, it defaults to just stacking the # elements and placing NaN values for any missing values. #5 Load mpg dataset from pydataset mpg = data('mpg') #6 output and read the documenation for the mpg dataset data('mpg', show_doc = True) # check the documentation ''' PyDataset Documentation (adopted from R Documentation. The displayed examples are in R) ## Fuel economy data from 1999 and 2008 for 38 popular models of car ### Description This dataset contains a subset of the fuel economy data that the EPA makes available on http://fueleconomy.gov. It contains only models which had a new release every year between 1999 and 2008 - this was used as a proxy for the popularity of the car.
#!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd import pydataset # In[2]: titanic = pydataset.data('titanic') # In[3]: titanic.columns # In[4]: titanic['class'].describe() # In[5]:
#this exercise is to practice and learn data sets, understand how they store #data, and retrieve it #dataset included in python from pydataset import data df = data('titanic') print(df)
#python : Topic :Decision Tree using mtcars #standard libaries import numpy as np import pandas as pd import matplotlib.pyplot as plt from pydataset import data import seaborn as sns #enhances graphing features df = data('mtcars') df.head() #from sklearn.tree import DecisionTreeClassifier, export_graphviz from sklearn.model_selection import train_test_split from sklearn import metrics, tree df['am'].value_counts() # am = automatic df.columns #classification #predict if transmission of car is 0 or 1 on basis of mpg, hp, wt X1 = df[['mpg','hp','wt']] Y1 = df['am'] Y1.value_counts() type(Y1) type(X1) #for splitting into train and test from sklearn.model_selection import train_test_split X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size=.20) X1_train.shape X1_test.shape #classification tree
from env import host, user, password
import seaborn as sns
import numpy as np
from scipy import stats
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression
from math import sqrt
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from pydataset import data

df = data('tips')

# 2. Fit a linear regression model (ordinary least squares) and compute yhat, predictions of tip using total_bill. You may follow these steps to do that:
#    import the method from statsmodels: from statsmodels.formula.api import ols
#    fit the model to your data, where x = total_bill and y = tip: regr = ols('y ~ x', data=df).fit()
#    compute yhat, the predictions of tip using total_bill: df['yhat'] = regr.predict(df.x)

#Descriptive
df.head()
df.columns.values
df.shape
df.describe()
df.info()
# Count of missing values per column.
print(df.isnull().sum())
df.total_bill.value_counts(ascending=True)
from pydataset import data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from evaluate import regression_errors
from sklearn.linear_model import LinearRegression

faithful = data('faithful')

# Correlation between the eruptions and waiting columns.
pearsons_r = faithful.eruptions.corr(faithful.waiting)

sns.scatterplot(x='waiting', y='eruptions', data=faithful)
plt.show()

# Linear model predicting eruptions from waiting; predictions are stored in 'predicted'.
reg = LinearRegression()
reg.fit(faithful[['waiting']], faithful[['eruptions']])
y_pred = reg.predict(faithful[['waiting']])
faithful['predicted'] = y_pred

# Predicted values are drawn first and the actual values second.
sns.scatterplot(x='waiting', y='predicted', data=faithful, alpha=0.8)
sns.scatterplot(x='waiting', y='eruptions', data=faithful)
plt.title('Waiting Time Between Geyser Eruptions at Old Faithful')
plt.yticks(np.arange(0, 6))

# Element 4 of the regression_errors result is used as the RMSE.
rmse = regression_errors(faithful.eruptions, faithful.predicted)[4]

blue_patch = mpatches.Patch(color='blue', label='Predicted')
orange_patch = mpatches.Patch(color='orange', label='Actual')
plt.legend(handles=[blue_patch, orange_patch])
# The annotation text is passed positionally: the `s=` keyword was renamed
# to `text` in newer matplotlib, while the positional form works in every
# version.
plt.annotate(f'RMSE: {rmse}', xy=(72, 1))
#Muli Index #----------------------------- #% Multindex in Pandas DF import pandas as pd import numpy as np import matplotlib.pyplot as plt from pydataset import data data('iris') data('titanic') #https://github.com/iamaziz/PyDataset/blob/master/examples/sample-datasets.ipynb mtcars = data('mtcars') mtcars.head() #mtcars.to_csv('data/mcarsdataset.csv') data1 = mtcars #describe data data1.columns #col names data1.values# values of DF data1.index data1.dtypes data1[['am','vs','cyl','gear','carb']] = data1[['am','vs', 'cyl','gear', 'carb']].astype('category') #data1[['am','vs', 'cyl','gear', 'carb']].astype('category', inplace=False) data1.dtypes data1.iloc[0:3,0:4] #reset the index : index to column data2 = data1.reset_index() data2.iloc[0:3,0:4] data2.columns data2.rename({'index':'carname'},inplace=True, axis='columns') #rename index column to carname : old to new data2.head()
plt.title('Total Bill vs Tip Amount')
tips.head()

# Same tip vs total_bill scatter, adding one visual encoding at a time.
sns.relplot(data=tips, y='tip', x='total_bill', hue='size')
sns.relplot(data=tips, y='tip', x='total_bill', hue='size', style='time')
sns.relplot(data=tips, y='tip', x='total_bill', hue='size', style='time', col='smoker')
sns.relplot(data=tips, y='tip', x='total_bill', style='size')
# IPython help lookup (the exported form of `sns.relplot?`).
get_ipython().run_line_magic('pinfo', 'sns.relplot')

tips[['total_bill', 'tip']]
tips[['total_bill', 'tip']].corr()

from pydataset import data
data('mpg')
# NOTE(review): `cars` is referenced here before the assignment on the next
# line; unless it is defined earlier in the file this raises NameError.
cars
cars = data('mpg')
cars.head()
cars.corr()

from matplotlib import cm
# Correlation heatmaps with a diverging colormap centred on 0.
sns.heatmap(cars.corr(), cmap=cm.PiYG, annot=True, center=0)
sns.heatmap(cars.corr(), cmap=cm.PiYG, annot=True, center=0)
cm.PiYG
from matplotlib import cm
sns.heatmap(cars.corr(), cmap=cm.Blues, annot=True, center=0)
sns.heatmap(cars.corr(), cmap=cm.Blues, annot=True, center=0)

sns.distplot(cars.hwy)
sns.boxplot(data=tips, y='tip')
sns.boxplot(data=tips, y='tip', x='time')
# Same plot with the axes swapped.
sns.boxplot(data=tips, y='time', x='tip')
def get_db_url(username, hostname, password, database):
    """
    Builds a 'mysql+pymysql://' connection URL from the four pieces given.
    The values are inserted into the URL exactly as passed in.
    """
    return f'mysql+pymysql://{username}:{password}@{hostname}/{database}'


def remove_commas_and_dollarsign(string_num):
    """
    Removes every comma, then any $ characters at the start or end.
    Takes a string, returns a string
    """
    without_commas = string_num.replace(',','')
    return without_commas.strip("$")


######################
#####the main script!

mpg = data('mpg')

# 1
# On average, which manufacturer has the best miles per gallon?

# Highway mileage summary (min / mean / max) for each manufacturer.
mpg.groupby('manufacturer').hwy.agg(['min', 'mean', 'max'])

# A combined figure is needed, so add a column holding the average of city and highway mileage.
mpg['mileage'] = (mpg['cty'] + mpg['hwy']) / 2

# Summarise the new column per manufacturer
mpg.groupby('manufacturer').mileage.agg(['min', 'mean', 'max'])

# Sort ascending by the mean; the last row is the manufacturer with the highest average
mpg.groupby('manufacturer').mileage.agg(['min', 'mean', 'max']).sort_values(by='mean').tail(1)
#Looks like Honda is the winner
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# plt?   # IPython-only help lookup; kept as a comment because `name?` is a
#        # SyntaxError when the file is run as ordinary Python.

#%%basic scatter plot
x = [5,7,8,7,2,17,2,9,4,11,12,9,6]
y = [99,86,87,88,111,86,103,87,94,78,77,85,86]
plt.scatter(x, y)
plt.show();

#%%
#dataset
from pydataset import data
mtcars = data('mtcars')
#conda upgrade --all -y
df=mtcars

#%% scatter plot
#dim - x, y, shape(s), color(c), tranpsarency(alpha)
df.describe() #summary (the method has to be called to produce the summary table)
df.dtypes #data types
df.columns
df['wt']; df['mpg']
# With data=df, the strings 'wt' and 'mpg' are looked up as columns of df.
plt.scatter(x='wt', y='mpg', data=df)
plt.show();
df.carb
df.carb.value_counts()
# Change target to target_names & merge with main dataframe df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names) df['species'].head() #https://python-data-science.readthedocs.io/en/latest/datasets.html #%%% #not working from rpy2.robjects import r, pandas2ri def data(name): return pandas2ri.ri2py(r[name]) #%% #pip install pydataset from pydataset import data titanic = data('titanic') titanic.head() #%%% from sklearn import datasets load_boston() # Load and return the boston house-prices dataset (regression). load_iris() # Load and return the iris dataset (classification). load_diabetes() # Load and return the diabetes dataset (regression). load_digits([n_class])# Load and return the digits dataset (classification). load_linnerud() # Load and return the linnerud dataset (multivariate regression). from sklearn.datasets import load_iris iris = load_iris() #%% import seaborn as sns
def load_children(self):
    """
    Create a PydatasetNode for every row of the table returned by data()
    and attach each truthy node to this object with add_child.
    """
    for record in data().itertuples():
        # itertuples() puts the row index at position 0, so positions 1 and
        # 2 are the first two columns of the row.
        first_value, second_value = record[1], record[2]
        node = PydatasetNode(first_value, second_value, self)
        if not node:
            continue
        self.add_child(node)
import pandas as pd from pydataset import data from matplotlib import pyplot as plt import numpy as np %matplotlib inline # Problem 1 print "Problem 1:" nottem_data = data('nottem') VADeaths_data = data('VADeaths') Arbuthnot_data = data('Arbuthnot') #data('nottem', show_doc=True) print 'Nottem:' print 'This data set contains the monthly average air temperatures at Nottingham Castle for 20 years.' print 'Since this data set consists of a single series, a line plot is best.' nottem_data.plot(y='nottem',legend=False) plt.show() #data('VADeaths', show_doc=True) print 'VA Deaths:' print 'This data set contains death rates in Virginia in 1940, cross-classified \ by age group, gender, and urban/rural.' print 'Since this data set contains multiple categories, a bar plot is more effective.' VADeaths_data.iloc[:][['Rural Male','Rural Female','Urban Male','Urban Female']].plot(kind='bar') plt.show() #data('Arbuthnot', show_doc=True) #Arbuthnot_data.plot(subplots=True, layout=(7,1))