예제 #1
0
CPI.Average: This is the average Consumer Price Index associated with that year. It was used to calculate
2018-equivalent values.

High.2018: This is the 2018-equivalent dollars for High.Value.

Low.2018: This is the 2018-equivalent dollars for Low.Value.
'''

# Show every column when printing instead of eliding the middle ones with '...'
pd.options.display.width = 0

# The raw file is read as Latin-1 and re-saved once as a UTF-8 copy; all
# further work uses that copy.
df = pd.read_csv('datasets/Minimum Wage Data.csv', encoding="latin")
# index=False keeps the RangeIndex out of the file. Without it, reading the
# file back would produce a spurious 'Unnamed: 0' column.
df.to_csv('datasets/minwage.csv', encoding="utf-8", index=False)
df = pd.read_csv('datasets/minwage.csv')

hdr("df.head()")
print(df.head())

# Now we use group-by:
gb = df.groupby('State')

hdr("gb.get_group('Alabama').set_index('Year').head()")
print(gb.get_group('Alabama').set_index('Year').head())

# The above selects the same rows as filtering with a boolean mask:
# print(df[df['State'] == 'Alabama'].set_index('Year').head())

# Iterating over groups
act_min_wage = pd.DataFrame()

for name, group in df.groupby('State'):
예제 #2
0
import pandas as pd
import matplotlib.pyplot as plt
from helpers import hdr

# Load the avocado data set into a DataFrame
df = pd.read_csv("datasets/avocado.csv")

# Peek at the first rows, the last rows, and a single column.
# (Attribute access, df.AveragePrice.head(), also works but is uncommon.)
for label, preview in (
    ("df.head(4)", df.head(4)),
    ("df.tail(5)", df.tail(5)),
    ("df['AveragePrice'].head()", df['AveragePrice'].head()),
):
    hdr(label)
    print(preview)

# Boolean mask for the rows whose region is Albany; indexing with it
# produces a new dataframe.
is_albany = df['region'] == "Albany"
albany_df = df[is_albany]

hdr("albany_df.head()")
print(albany_df.head())

# Show the index of the filtered dataframe
hdr("albany_df.index")
print(albany_df.index)
예제 #3
0
from helpers import hdr

# Show every column when printing
pd.options.display.width = 0

df = pd.read_csv('datasets/minwage.csv')

# Build a Year-indexed frame with one column per state, each holding that
# state's 'Low.2018' values.
act_min_wage = pd.DataFrame()

for name, group in df.groupby('State'):
    # Single-column frame for this state, with the column named after the state
    state_col = group.set_index('Year')[['Low.2018']].rename(columns={'Low.2018': name})
    if act_min_wage.empty:
        act_min_wage = state_col
    else:
        act_min_wage = act_min_wage.join(state_col)

hdr('act_min_wage.head()')
print(act_min_wage.head())

# Treat 0 as missing and drop every state column that contains a missing
# value before computing the pairwise correlations.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
min_wage_corr = act_min_wage.replace(0, np.nan).dropna(axis=1).corr()

hdr('min_wage_corr.head()')
print(min_wage_corr.head())

plt.matshow(min_wage_corr)
plt.show()

# Short axis labels: the first two characters of each column name, upper-cased
labels = [c[:2].upper() for c in min_wage_corr.columns]

def show_heatmap(labels):
    fig = plt.figure(figsize=(12, 12))
    #                     <- all the subplots in the figure are in a 1 by 1 grid, and this is number one (only 1 graph)
예제 #4
0
import pandas as pd
import matplotlib.pyplot as plt

from helpers import hdr

df = pd.read_csv("datasets/avocado.csv")

# Convert the Date column to datetime objects
df['Date'] = pd.to_datetime(df['Date'])

hdr("df.head()")
print(df.head())

# Filter first, then index by Date. set_index returns a new frame, so there
# is no need to deep-copy the whole data set before selecting the Albany rows.
albany_df = df[df['region'] == "Albany"].set_index("Date")

albany_df["AveragePrice"].plot()
plt.show()

# Moving average of size 25
albany_df["AveragePrice"].rolling(25).mean().plot()
plt.show()
# It doesn't look right.
# We need to sort the dataset by the Date, which is the index:
albany_df = albany_df.sort_index()

albany_df["AveragePrice"].rolling(25).mean().plot(
    title="AveragePrice with rolling average=25")
plt.show()

# Put this data as a new column in our dataframe:
예제 #5
0
import pandas as pd

from helpers import hdr

# Show every column when printing
pd.options.display.width = 0

# The first CSV column is used as the row index
df = pd.read_csv("datasets/diamonds.csv", index_col=0)

hdr("df.head()")
print(df.head())

# We start with linear regression. Ideally the string classifications map
# onto a linear scale, i.e. they have a meaningful order.
# First, list every distinct cut:
hdr("df['cut'].unique()")
print(df['cut'].unique())

# Hard-code the order: each cut is mapped to its rank, counting from 1
cut_class_dict = {
    cut: rank
    for rank, cut in enumerate(
        ('Fair', 'Good', 'Very Good', 'Premium', 'Ideal'), start=1)
}

# Same inspection for clarity
hdr("df['clarity'].unique()")
print(df['clarity'].unique())
clarity_dict = {
    "I3": 1,
예제 #6
0
import pandas as pd
import numpy as np

from helpers import hdr

# Show every column when printing
pd.options.display.width = 0

# Goal of this lesson: look for a correlation between unemployment and the
# minimum wage.

unemp_county = pd.read_csv('datasets/unemployment-by-county-us/output.csv')

hdr('unemp_county.head()')
print(unemp_county.head())

df = pd.read_csv('datasets/minwage.csv')

# Year-indexed frame with one 'Low.2018' column per state, named after the state
act_min_wage = pd.DataFrame()

for name, group in df.groupby('State'):
    by_year = group.set_index('Year')
    state_col = by_year[['Low.2018']].rename(columns={'Low.2018': name})
    # The first state seeds the frame; every later state is joined onto it
    act_min_wage = state_col if act_min_wage.empty else act_min_wage.join(state_col)

hdr('act_min_wage.head()')
print(act_min_wage.head())