def show_types(limit):
    """
    Show the labels, shapes, and column types of datasets in PyDataset.

    :param limit: int, how many datasets from data() to show
    :return: None
    """
    pydataset_labels = data()['dataset_id']
    for label in pydataset_labels[:limit]:
        all_data = data(label)
        types = ','.join(str(type(list(all_data[col])[0])) for col in all_data.columns.values)
        print(f'{label} has shape {all_data.shape[0]},{all_data.shape[1]} types are {types}')
        print("")
def train_iris(classifier, prediction_label):
    classifier_base_name = 'PyDataset_iris_'
    iris = data('iris')
    X = np.array(iris)[:, :4]
    labels = np.array(iris)[:, 4]
    unique_labels = list(set(labels))
    for label_i in unique_labels:
        y = np.array([i == label_i for i in labels])
        PyDatasetSchool.basic_train((X, y), classifier_base_name + label_i, classifier, prediction_label)
# |||||||||||||||||||||||||||||||||||||||||||||||||||||||
# |||||||||||||||||||||||||||||||||||||||||||||||||||||||

# For several of the following exercises, you'll need to load several datasets 
# using the pydataset library. (If you get an error when trying to run the import 
# below, use pip to install the pydataset package.)

from pydataset import data

# When the instructions say to load a dataset, you can pass the name of the dataset
#  as a string to the data function to load the dataset. 
# You can also view the documentation for the data set by passing 
#      the show_doc keyword argument.

# data('mpg', show_doc=True) # view the documentation for the dataset
mpg = data('mpg') # load the dataset and store it in a variable

# All the datasets loaded from the pydataset library will be pandas dataframes.

# 1 - Copy the code from the lesson to create a dataframe full of student grades.
# a - Create a column named passing_english that indicates whether 
#       each student has a passing grade in english.

# b - Sort the english grades by the passing_english column. How are duplicates handled?
# c - Sort the english grades first by passing_english and then by student name.
#        All the students that are failing english should be first, 
#       and within the students that are failing english they should be 
#       ordered alphabetically. The same should be true for the students 
#       passing english. (Hint: you can pass a list to the .sort_values method)
# d - Sort the english grades first by passing_english, and then by the actual
#        english grade, similar to how we did in the last step. (A sketch of a-d follows.)
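# A minimal sketch of a-d with a hypothetical grades dataframe (the lesson's
# actual data may differ; the column names here are assumptions):
import pandas as pd

grades = pd.DataFrame({'name': ['Sally', 'Jane', 'Suzie', 'Billy'],
                       'english': [76, 62, 94, 98]})
grades['passing_english'] = grades.english >= 70          # a
grades.sort_values(by='passing_english')                  # b: tied rows land in arbitrary order (default sort is not guaranteed stable)
grades.sort_values(by=['passing_english', 'name'])        # c: failing students first, alphabetical within each group
grades.sort_values(by=['passing_english', 'english'])     # d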
Example #4
sns.relplot(x='x', y='y', style='dataset', hue='dataset', data=set_0)

sns.pairplot(set_0)

# !pip install pydataset
# had problems - fixed - there were two Python installs

#  INSECT SPRAY SET
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

from pydataset import data
insect_spray = data('InsectSprays')

insect_spray.head()

# -- READING SHOWING DOCUMENTATION
data("InsectSprays", show_doc=True)

insect_spray.groupby('spray').describe()

#  -- BOX PLOTS ---
plt.figure(figsize=(12, 10))  # this makes it larger than default
sns.boxplot(data=insect_spray, y='count', x='spray')

# -- Load the swiss dataset and read its documentation.
# Create visualizations to answer the following questions:
Example #5
#Topic: Data Sets in Python
#-----------------------------
#https://github.com/iamaziz/PyDataset
#libraries

import numpy as np
import pandas as pd

import pydataset
from pydataset import data
data('iris')
data('iris', show_doc=True)#help

#better way 
from pydataset import data
data()
alldatasets = data().copy()
type(alldatasets)


from pydataset import data
data('iris')
data('marketing')
data('titanic')

alldatasets.head()

#check for availability
data('iris')
data('mtcars')
data('ais')
Example #6
#Topic ----K Means Clustering
#https://www.analyticsindiamag.com/beginners-guide-to-k-means-clustering/
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
iris = data('iris')
df = iris  # avoid shadowing the imported data() function
df.head()
df.columns
df.dtypes
df.shape
#%%%K-Means Algorithm
#1. Select an appropriate value for K, the number of clusters (centroids)
#2. Select random initial centroids, one per cluster
#3. Assign each data point to its closest centroid
#4. Recompute each centroid from the points assigned to it in step 3
#5. Repeat steps 3 and 4 until the cluster assignments stop changing

#%%%
#Dropping the 'Species' column
iris_clustering = iris.drop(columns=['Species'])
iris_clustering
#Selecting 2 random features from the dataset for clustering
#Here we choose Sepal Length @ column 0 and Petal Length @ column 2
X = iris_clustering.iloc[:, [0, 2]].values
X
#We only chose 2 features because we are going to plot in 2D space; the algorithm works for any number of features.

#%%Initialising K-Means With Optimum Number Of Clusters
#Fitting K-Means to the dataset
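# A sketch of the fitting step the comment above introduces (n_clusters=3 is
# an assumption, matching the three iris species):
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=42)
y_kmeans = kmeans.fit_predict(X)                # cluster label for each row of X

plt.scatter(X[:, 0], X[:, 1], c=y_kmeans)       # points colored by cluster
plt.scatter(kmeans.cluster_centers_[:, 0],
            kmeans.cluster_centers_[:, 1], c='red', marker='x')  # centroids
plt.xlabel('Sepal Length'); plt.ylabel('Petal Length')
plt.show()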
Example #7
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression
import matplotlib.pyplot as plt
from pydataset import data
import warnings
warnings.filterwarnings('ignore')

tips = data('tips')

bill = tips['total_bill']
tip = tips['tip']
regr = ols('tip ~ total_bill', data=tips).fit()
tips['yhat'] = regr.predict(tips[['total_bill']])  # predict() expects a frame with a total_bill column


def plot_residuals(x, y, dataframe):
    sns.residplot(x=x, y=y, data=dataframe)  # newer seaborn requires keyword args
    plt.show()


def regression_errors(y, yhat):
    sse = ((y - yhat)**2).sum()
    ess = ((yhat - y.mean())**2).sum()
    tss = sse + ess
    mse = mean_squared_error(y, yhat)
    rmse = mse ** 0.5
    return sse, ess, tss, mse, rmse
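# Usage sketch for the helper above, on the tips fit from earlier:
sse, ess, tss, mse, rmse = regression_errors(tips['tip'], tips['yhat'])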
Example #8
def plotTitanic():
    #pydataset.data('titanic', show_doc=True)
    titanic = pydataset.data('titanic')
    #titanic['class'].value_counts().plot(kind='bar')
    titanic.groupby('survived')['class'].value_counts().plot(kind='bar')
    plt.show()
Example #9

del df['sigla']
df


# In[2]:


import pydataset


# In[4]:


pydataset.data()


# In[6]:


type(pydataset.data())


# In[7]:


titanic = pydataset.data('titanic')


# In[10]:
from pydataset import data
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

pima = data("Pima.tr")
print(pima)
pima.plot(kind="scatter", x="skin", y="bmi")
x_train, x_test, y_train, y_test = train_test_split(pima.skin, pima.bmi)

plt.scatter(x_train, y_train, label="Training Data", color='r')
plt.scatter(x_test, y_test, label="Testing Data", color='b')
plt.legend()

lr = LinearRegression()
lr.fit(x_train.values.reshape(-1, 1), y_train)

y_predicted = lr.predict(x_test.values.reshape(-1, 1))

plt.plot(x_test, y_predicted, color='r')
plt.scatter(x_test, y_test, color='b')
plt.show()

a = np.array([50])
print(a.ndim)
print(lr.predict(a.reshape(-1, 1)))
Example #11
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 30 12:32:24 2019

@author: yash_j1301
"""
#Multi-indexing 
import numpy as np
import matplotlib.pyplot as plt
from pydataset import data

mtcars=data('mtcars')
mtcars.head(10)
dir(data)
iris=data('iris')
titanic=data('titanic')
titanic.head(100)

data1=mtcars.copy()
data1.columns #column names
data1.values #matrix of numerical values 
data1.index #unique rows
data1.am.dtypes #data types

data1[['am','cyl','mpg','hp','drat','disp','wt','qsec','vs','gear','carb']] = data1[['am','cyl','mpg','hp','drat','disp','wt','qsec','vs','gear','carb']].astype('category') #converting columns to category (assign back; astype is not in-place)
data1.dtypes #data types of the dataframe

data2=data1.reset_index() #resetting the previously set indexes of the dataframe
data2.index
data2.iloc[0:3,0:4] #index column created
Example #12
def load_data(self, dataset_name):
    df = data(str(dataset_name))
    df.columns = df.columns.str.replace('.', '_', regex=False)  # '.' is a regex metacharacter
    df.columns = df.columns.str.lower()
    return df
Example #13
# plt.savefig("bar2.pdf")
# plt.show()
plt.clf(); plt.close()

crime[["Robbery", "Aggravated-assault", "Vehicle-Theft"]].plot(kind='box')
plt.savefig("box1.pdf")
plt.clf(); plt.close()
crime[["Robbery", "Aggravated-assault", "Vehicle-Theft"]].plot(kind='box',
                                                        vert=False, rot=30)
plt.savefig("box2.pdf")
plt.clf(); plt.close()
plt.show()


# Hair and Eye color scatter plot
hec = data("HairEyeColor")
X = np.unique(hec["Hair"], return_inverse=True)
Y = np.unique(hec["Eye"], return_inverse=True)
hec["Hair"] = X[1]
hec["Eye"] = Y[1]
hec.plot(kind="scatter", x="Hair", y="Eye", s=hec["Freq"]*20)
plt.xticks([0,1,2,3], X[0])
plt.yticks([0,1,2,3], Y[0])
plt.savefig("HairEyeColorscatter.png")

##########Plots for Data Visualization Section###########
data('Icecream').plot(kind='scatter', x='temp', y='cons')
plt.savefig('Nolabels.png')

msleep = data('msleep')
msleep.plot(y='sleep_total', title='Mammalian Sleep Data', legend=False)
Example #14
def from_uri(cls, uri: str, **kwargs) -> "PyDataSet":
    dataset_name = uri.split("://")[1]
    data = pydataset.data(dataset_name)
    return PyDataSet(inner_data=data, uri=uri)
Example #15
dict1 = {'rollno':[1,2,3], 'name':['India','Pakistan','England'], 'captain': ['C1','C2','C3']}
print(dict1)
dict1['rollno']

#%%
import numpy as np
np.arange(1,10)

import pandas as pd
link1 = 'https://raw.githubusercontent.com/DUanalytics/datasets/master/csv/buyPC.csv'
df1 = pd.read_csv(link1)
df1.head()

from pydataset import data 
data('mtcars')
mtcars1 = data('mtcars')
mtcars1.describe()
mtcars1.columns
#summary
mtcars1.groupby(['cyl','gear']).agg({'mpg':'mean', 'gear':'size'})
pd.crosstab(mtcars1.cyl, mtcars1.gear)


import matplotlib.pyplot as plt
import seaborn as sns
sns.heatmap(mtcars1.corr(), annot=True)
mtcars1.gear.value_counts().plot(kind='bar')
mtcars1.plot.scatter(x='wt', y='mpg')
mtcars1.groupby(['gear','cyl']).size().unstack().plot.bar()
Example #16
def load_from_pydataset(self, dataset: str):
    self.data_frame = pydataset.data(dataset)
Example #17
#Topic:Pandas DF
#-----------------------------
#libraries
import numpy as np #import NumPy library
import pandas as pd
#a pandas DataFrame is a combination of pandas Series:
#one column is a Series of one datatype; a DataFrame can hold multiple data types
#pip install pydataset #install pydataset
from pydataset import data #importing dataset 
mtcars = data('mtcars') #copying mtcars data to mtcars object
mtcarsDF = mtcars
mtcarsDF

#%%describing
mtcarsDF.shape #get number of rows and columns 
mtcarsDF.head(3) #get top 3 rows
mtcarsDF.tail(4) #get bottom 4 rows
mtcarsDF.describe()
mtcarsDF.columns #column names
mtcarsDF.dtypes #columns with their datatypes

mtcarsDF.index  #here index by rownames
type(mtcarsDF)

mtcarsDF.select_dtypes(include=['int64'])
mtcarsDF.select_dtypes(exclude=['int64'])
mtcarsDF.isna()
mtcarsDF.notna()
id(mtcarsDF)
mtcars.empty
mtcars.size
Example #18
query = 'SELECT * FROM employees LIMIT 5 OFFSET 50'

# In[5]:

pd.read_sql(query, url)

# In[6]:

pd.read_sql('SHOW TABLES', url)

# ## 1. Load the mpg dataset. Read the documentation for it, and use the data to answer these questions:

# In[7]:

from pydataset import data
mpg = data('mpg')
data('mpg', show_doc=True)

# - On average, which manufacturer has the best miles per gallon?

# In[8]:

mpg.groupby('manufacturer').hwy.agg(['median']).nlargest(n=1, columns="median")

# - How many different manufacturers are there?

# In[9]:

mpg.manufacturer.nunique()

# - How many different models are there?
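# A one-liner mirroring the manufacturer count above (a sketch; the
# notebook's original answer cell is not shown):

mpg.model.nunique()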
Example #19
# # Do automatic or manual cars have better miles per gallon?
# def simple_trans(trans):
#     if trans[:4] == 'auto':
#         return 'auto'
#     else:
#         return 'manual'

# new_trans = mpg_df.trans.apply(simple_trans)
# print(mpg_df.assign(trans2 = mpg_df.trans.apply(simple_trans)).groupby('trans2')['avg_mpg'].mean().idxmax())



# Load the Mammals dataset. Read the documentation for it, 
# and use the data to answer these questions:
mammals_df = data('Mammals')
data('Mammals', show_doc=True)
# print(mammals_df)

# # How many rows and columns are there?
# mammals_df.info()
# # 107 rows and 4 columns

# # What are the data types?
# # float and boolean

# # What is the weight of the fastest animal?
# print(mammals_df.loc[mammals_df['speed'] == mammals_df['speed'].max()])
# print('-------------')
# # weight is 55 kg
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from pydataset import data
import pandas as pd  #pandas used for data manipulation

#scaler = StandardScaler()

import time  #clock
import os
import psutil
time_start = time.perf_counter()  # time.clock() was removed in Python 3.8

#TITANIC DATASET
#data('titanic', show_doc=True) #can predict if they survived; can do males and females separately
titanictemp = data('titanic')
y = pd.factorize(titanictemp['survived'])[0]
clas = pd.factorize(titanictemp['class'])[0]
age = pd.factorize(titanictemp['age'])[0]
sex = pd.factorize(titanictemp['sex'])[0]

titanic = pd.DataFrame()
titanic['age'] = age
titanic['class'] = clas
titanic['sex'] = sex
titanic['survived'] = y  # 0 = survived, 1 = died

dt = DecisionTreeClassifier()

X = titanic.iloc[:, 0:3]
y = titanic['survived']
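# A sketch of fitting and scoring the tree (the split fraction and
# random_state are assumptions, not from the original script):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))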
Example #21
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pydataset import data

iris = data("iris")

iris = data("iris", show_doc=True)  #viewing iris docs
iris.columns  #viewing column_names

# What does the distribution of petal lengths look like?
sns.distplot(iris["Petal.Length"])

# Is there a correlation between petal length and petal width?
iris_corr = iris.corr()  #pairwise correlations between the numeric columns

sns.heatmap(iris_corr, annot=True)

# Would it be reasonable to predict species based on sepal width and sepal length?
# Which features would be best used to predict species?

sns.relplot(x="Sepal.Length", y="Sepal.Width", data=iris, hue="Species")

sepals_corr = pd.crosstab(iris["Sepal.Length"],
                          iris["Sepal.Width"],
                          normalize=True)
sepals_corr
sns.heatmap(sepals_corr)
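# To eyeball which features separate the species best, a pairplot colored
# by species is a reasonable sketch (not part of the original exercise code):
sns.pairplot(iris, hue="Species")
plt.show()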
Example #22
#Method 3 - quantile method
Q1 = pd.Series(marks1).quantile(0.25)
Q3 = pd.Series(marks1).quantile(0.75)
Q1, Q3
IQR = Q3 - Q1
IQR
IQR2 = stats.iqr(pd.Series(marks1))
IQR2

#outlier : < Q1 - 1.5*IQR or > Q3 + 1.5*IQR
(marks1 < Q1 - 1.5 * IQR) | (marks1 > Q3 + 1.5 * IQR)

#dataframe
from pydataset import data
data('Boston')  #https://www.engineeringbigdata.com/boston-dataset-scikit-learn-machine-learning-in-python/#:~:text=The%20sklearn%20Boston%20dataset%20is,and%20descriptions%20of%20each%20column.
boston = data('Boston')

boston.head()
boston.columns
#box plot
sns.boxplot(x=boston['dis'])
# The plot shows three points between 10 and 12; these are outliers, as they fall outside the box of the other observations, i.e. nowhere near the quartiles.

#scatterplot
boston.columns
fig, ax = plt.subplots(figsize=(16, 8))
ax.scatter(boston['indus'], boston['tax'])
ax.set_xlabel('Proportion of non-retail business acres per town')
ax.set_ylabel('Full-value property-tax rate per $10,000')
Example #23
# -*- coding: utf-8 -*-
"""
Created on Tue May 14 09:14:31 2019

@author: daver
"""

from pydataset import data
faithful = data('faithful')
data('faithful', show_doc=True)  # show_doc prints the documentation and returns None
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
import matplotlib.pyplot as plt
from math import sqrt

corr = pearsonr(faithful.waiting, faithful.eruptions)
reg = linear_model.LinearRegression()
reg.fit(faithful[['waiting']],
        faithful.eruptions)  #need a dataframe for x,not series
y_actual = reg.predict(faithful[['waiting']])
rms = sqrt(mean_squared_error(faithful.eruptions, y_actual))

plt.scatter(faithful.waiting, faithful.eruptions, c="blue")
plt.plot(faithful.waiting, y_actual, c="red")
plt.title("Predicted time between eruptions")
plt.ylabel("#of eruptions")
plt.yticks(range(1, 7))
plt.text(50, 5.5, 'RMSE:{:.2}'.format(rms))
plt.show()
Example #24
users.merge(drop_roles, how='left', indicator=True)
'''
id	name	_merge
0	1	bob	left_only
1	2	joe	left_only
2	3	sally	left_only
3	4	adam	left_only
4	5	jane	left_only
5	6	mike	left_only
'''
# Trying to join when foreign keys are deleted removes the ability for
# pandas to merge the dataframes. Instead, it defaults to just stacking the
# elements and placing NaN values for any missing values.

#5 Load mpg dataset from pydataset
mpg = data('mpg')

#6 output and read the documenation for the mpg dataset
data('mpg', show_doc = True) # check the documentation

'''
PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Fuel economy data from 1999 and 2008 for 38 popular models of car

### Description

This dataset contains a subset of the fuel economy data that the EPA makes
available on http://fueleconomy.gov. It contains only models which had a new
release every year between 1999 and 2008 - this was used as a proxy for the
popularity of the car.
'''

#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import pydataset


# In[2]:


titanic = pydataset.data('titanic')


# In[3]:


titanic.columns


# In[4]:


titanic['class'].describe()


# In[5]:

Example #26
#this exercise is to practice and learn data sets, understand how they store
#data, and retrieve it

#dataset included in python

from pydataset import data

df = data('titanic')

print(df)
Example #27
#Python Topic: Decision Tree using mtcars

#standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
import seaborn as sns     #enhances graphing features
df = data('mtcars')
df.head()
#from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn import metrics, tree
df['am'].value_counts()   # am: transmission (0 = automatic, 1 = manual)
df.columns

#classification
#predict if transmission of car is 0 or 1 on basis of mpg, hp, wt
X1 = df[['mpg','hp','wt']]
Y1 = df['am']
Y1.value_counts()
type(Y1)
type(X1)

#for splitting into train and test 
from sklearn.model_selection import train_test_split
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size=.20)
X1_train.shape
X1_test.shape

#classification tree
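# A sketch of the classification tree the comment above introduces
# (max_depth=3 is an assumption to keep the plot readable):
dtree = tree.DecisionTreeClassifier(max_depth=3)
dtree.fit(X1_train, Y1_train)
Y1_pred = dtree.predict(X1_test)
print(metrics.accuracy_score(Y1_test, Y1_pred))
tree.plot_tree(dtree, feature_names=['mpg', 'hp', 'wt'], filled=True)
plt.show()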
Example #28
from env import host, user, password
import seaborn as sns
import numpy as np
from scipy import stats
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression
from math import sqrt
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from pydataset import data

df = data('tips')

# 2. Fit a linear regression model (ordinary least squares) and compute yhat, predictions of tip using total_bill. You may follow these steps to do that:

# import the method from statsmodels: from statsmodels.formula.api import ols
# fit the model to your data, where x = total_bill and y = tip: regr = ols('y ~ x', data=df).fit()
# compute yhat, the predictions of tip using total_bill: df['yhat'] = regr.predict(df.x)
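# A sketch of those steps with this dataset's actual column names
# (the 'yhat' column name follows the exercise's own suggestion):
regr = ols('tip ~ total_bill', data=df).fit()
df['yhat'] = regr.predict(df[['total_bill']])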

#Descriptive
df.head()
df.columns.values
df.shape
df.describe()
df.info()
print(df.isnull().sum())
df.total_bill.value_counts(ascending=True)
Example #29
from pydataset import data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from evaluate import regression_errors
from sklearn.linear_model import LinearRegression

faithful = data('faithful')

pearsons_r = faithful.eruptions.corr(faithful.waiting)

sns.scatterplot(x='waiting', y='eruptions', data=faithful)
plt.show()

reg = LinearRegression()
reg.fit(faithful[['waiting']], faithful[['eruptions']])

y_pred = reg.predict(faithful[['waiting']])
faithful['predicted'] = y_pred

sns.scatterplot(x='waiting', y='predicted', data=faithful, alpha=0.8)
sns.scatterplot(x='waiting', y='eruptions', data=faithful)
plt.title('Waiting Time Between Geyser Eruptions at Old Faithful')
plt.yticks(np.arange(0, 6))
rmse = regression_errors(faithful.eruptions, faithful.predicted)[4]
blue_patch = mpatches.Patch(color='blue', label='Predicted')
orange_patch = mpatches.Patch(color='orange', label='Actual')
plt.legend(handles=[blue_patch, orange_patch])
plt.annotate(f'RMSE: {rmse}', xy=(72, 1))  # the 's=' keyword was removed from annotate in newer matplotlib
Example #30
#Multi Index
#-----------------------------
#% Multi-index in pandas DF
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pydataset import data
data('iris')
data('titanic')
#https://github.com/iamaziz/PyDataset/blob/master/examples/sample-datasets.ipynb
mtcars = data('mtcars')
mtcars.head()
#mtcars.to_csv('data/mcarsdataset.csv')
data1 = mtcars
#describe data
data1.columns  #col names
data1.values# values of DF
data1.index
data1.dtypes
data1[['am','vs','cyl','gear','carb']] = data1[['am','vs', 'cyl','gear', 'carb']].astype('category')
#data1[['am','vs', 'cyl','gear', 'carb']].astype('category', inplace=False)
data1.dtypes
data1.iloc[0:3,0:4]
#reset the index : index to column
data2 = data1.reset_index()
data2.iloc[0:3,0:4]

data2.columns
data2.rename({'index':'carname'},inplace=True, axis='columns')
#rename index column to carname : old to new
data2.head()
Example #31
plt.title('Total Bill vs Tip Amount')
tips.head()
sns.relplot(data=tips, y='tip', x='total_bill', hue='size')
sns.relplot(data=tips, y='tip', x='total_bill', hue='size', style='time')
sns.relplot(data=tips,
            y='tip',
            x='total_bill',
            hue='size',
            style='time',
            col='smoker')
sns.relplot(data=tips, y='tip', x='total_bill', style='size')
# sns.relplot?  # IPython help lookup for relplot
tips[['total_bill', 'tip']]
tips[['total_bill', 'tip']].corr()
from pydataset import data
data('mpg')
cars = data('mpg')
cars.head()
cars.corr()
from matplotlib import cm
sns.heatmap(cars.corr(), cmap=cm.PiYG, annot=True, center=0)
sns.heatmap(cars.corr(), cmap=cm.Blues, annot=True, center=0)
sns.distplot(cars.hwy)
sns.boxplot(data=tips, y='tip')
sns.boxplot(data=tips, y='tip', x='time')
sns.boxplot(data=tips, y='time', x='tip')
Example #32
def get_db_url(username, hostname, password, database):
    url = f'mysql+pymysql://{username}:{password}@{hostname}/{database}'
    return url

def remove_commas_and_dollarsign(string_num):
    """
    Cleans off a leading $ and commas wherever they are. Takes a string, returns a string.
    """
    x = string_num.replace(',', '')
    x = x.strip("$")
    return x
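# Usage sketch:
# remove_commas_and_dollarsign("$1,234,567")  # -> "1234567"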
######################
#####the main script!

mpg = data('mpg')

# 1
# On average, which manufacturer has the best miles per gallon?

# This groups the manufacturers by their hwy mileage.
mpg.groupby('manufacturer').hwy.agg(['min', 'mean', 'max'])
# I'm gonna have to make a new column that is the average of city and highway mileage.
mpg['mileage'] = (mpg['cty'] + mpg['hwy']) / 2
#Let's group on this new column
mpg.groupby('manufacturer').mileage.agg(['min', 'mean', 'max'])
#Now sort ascending by mean and take the last row (the highest average mileage)
mpg.groupby('manufacturer').mileage.agg(['min', 'mean', 'max']).sort_values(by='mean').tail(1)
#Looks like Honda is the winner

Example #33
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# plt?  # IPython help for matplotlib.pyplot

#%%basic scatter plot
x = [5,7,8,7,2,17,2,9,4,11,12,9,6]
y = [99,86,87,88,111,86,103,87,94,78,77,85,86]

plt.scatter(x, y)
plt.show();

#%%
#dataset
from pydataset import data
mtcars = data('mtcars')
#conda upgrade --all -y
df=mtcars

#%% scatter plot
#dim - x, y, shape(s), color(c), transparency(alpha)
df.describe() #summary
df.dtypes  #data types
df.columns
df['wt']; df['mpg']
plt.scatter(x='wt', y='mpg', data=df)
plt.show();
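#%% scatter using the extra dimensions named above (the column choices for
# size and color are illustrative assumptions):
plt.scatter(df['wt'], df['mpg'], s=df['hp'], c=df['cyl'], alpha=0.5)
plt.colorbar(label='cyl')
plt.show();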

df.carb

df.carb.value_counts()
# Change target to target_names & merge with main dataframe
# (note: requires the sklearn iris loaded below via load_iris())
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
df['species'].head()
#https://python-data-science.readthedocs.io/en/latest/datasets.html

#%%%

#not working
from rpy2.robjects import r, pandas2ri
def data(name): 
    return pandas2ri.ri2py(r[name])

#%%
#pip install pydataset
from pydataset import data
titanic = data('titanic')
titanic.head()    

#%%%
from sklearn import datasets
datasets.load_iris()      # Load and return the iris dataset (classification).
datasets.load_diabetes()  # Load and return the diabetes dataset (regression).
datasets.load_digits()    # Load and return the digits dataset (classification); n_class is optional.
datasets.load_linnerud()  # Load and return the linnerud dataset (multivariate regression).
#datasets.load_boston()   # boston house-prices (regression); removed in scikit-learn 1.2

from sklearn.datasets import load_iris
iris = load_iris()

#%%
import seaborn as sns
Example #35
File: pydaset.py Project: janpipek/boadata
def load_children(self):
    for row in data().itertuples():
        child = PydatasetNode(row[1], row[2], self)
        if child:
            self.add_child(child)
Example #36
import pandas as pd
from pydataset import data
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline


# Problem 1
print "Problem 1:"

nottem_data = data('nottem')
VADeaths_data = data('VADeaths')
Arbuthnot_data = data('Arbuthnot')

#data('nottem', show_doc=True)
print('Nottem:')
print('This data set contains the monthly average air temperatures at Nottingham Castle for 20 years.')
print('Since this data set consists of a single series, a line plot is best.')
nottem_data.plot(y='nottem',legend=False)
plt.show()

#data('VADeaths', show_doc=True)
print('VA Deaths:')
print('This data set contains death rates in Virginia in 1940, cross-classified '
      'by age group, gender, and urban/rural.')
print('Since this data set contains multiple categories, a bar plot is more effective.')
VADeaths_data.iloc[:][['Rural Male','Rural Female','Urban Male','Urban Female']].plot(kind='bar')
plt.show()

#data('Arbuthnot', show_doc=True)
#Arbuthnot_data.plot(subplots=True, layout=(7,1))