Example #1
# Imports assumed by this snippet (not shown in the original excerpt):
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

import visuals as vs  # the project's supplementary visualization code


def data_preprocessing(data):
    # Split the data into features and target label
    income_raw = data['income']
    features_raw = data.drop('income', axis=1)

    # Visualize skewed continuous features of original data
    vs.distribution(data)

    # Log-transform the skewed features
    skewed = ['capital-gain', 'capital-loss']
    features_log_transformed = pd.DataFrame(data=features_raw)
    features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))

    # Visualize the new log distributions
    vs.distribution(features_log_transformed, transformed=True)
    # Initialize a scaler, then apply it to the features
    scaler = MinMaxScaler()  # default=(0, 1)
    numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

    features_log_minmax_transform = pd.DataFrame(data=features_log_transformed)
    features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

    # Show the first five records with scaling applied
    print(features_log_minmax_transform.head(n=5))

    # One-hot encode the 'features_log_minmax_transform' data using pandas.get_dummies()
    features_final = pd.get_dummies(features_log_minmax_transform,
                                    prefix=['workclass', 'education_level', 'marital-status', 'occupation',
                                            'relationship', 'race', 'sex', 'native-country'],
                                    columns=['workclass', 'education_level', 'marital-status', 'occupation',
                                             'relationship', 'race', 'sex', 'native-country'])

    # Encode the 'income_raw' data to numerical values
    encoder = LabelEncoder()
    income = encoder.fit_transform(income_raw)

    # Print the number of features after one-hot encoding
    encoded = list(features_final.columns)
    print("{} total features after one-hot encoding.".format(len(encoded)))

    # Show the encoded feature names and the encoded income labels
    print(encoded)
    print(income)

    return features_final, income
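
# A minimal usage sketch for the function above (a sketch only; it assumes
# 'census.csv' and the project's visuals.py are available, as in the other
# examples on this page):
if __name__ == '__main__':
    data = pd.read_csv('census.csv')
    features_final, income = data_preprocessing(data)
    print(features_final.shape, income.shape)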
Example #2
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 20 13:22:41 2019

@author: cidm
"""

import pandas as pd
import visuals as vs
import numpy as np

data = pd.read_csv('census.csv')
# NOTE: this assumes vs.distribution returns a matplotlib figure
vs.distribution(data).savefig('skewed_graph.jpg')

income_raw = data['income']
features_raw = data.drop('income', axis=1)

skewed = ['capital-gain', 'capital-loss']
features_log_transformed = pd.DataFrame(data=features_raw)
features_log_transformed[skewed] = features_raw[skewed].apply(
    lambda x: np.log(x + 1))

# Visualize the new log distributions
vs.distribution(features_log_transformed,
                transformed=True).savefig('log_transformed_graph.png')

# Scaling the features
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler()  # default=(0, 1)
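
# The excerpt ends here; a sketch of how the scaler would typically be applied
# next, mirroring the other examples on this page (column names assumed to
# match the census dataset):
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss',
             'hours-per-week']
features_log_minmax_transform = pd.DataFrame(data=features_log_transformed)
features_log_minmax_transform[numerical] = scaler.fit_transform(
    features_log_transformed[numerical])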
Example #3
# TODO: Number of records where individual's income is more than $50,000
n_greater_50k = data[data.income == '>50K'].shape[0]

# TODO: Number of records where individual's income is at most $50,000
n_at_most_50k = data[data.income == '<=50K'].shape[0]

# TODO: Percentage of individuals whose income is more than $50,000
greater_percent = (float(n_greater_50k) / n_records) * 100

# Split the data into features and target label
income_raw = data['income']
features_raw = data.drop('income', axis=1)

# Visualize skewed continuous features of original data
vs.distribution(data)

# Import sklearn.preprocessing.MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler()
numerical = [
    'age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'
]
features_raw[numerical] = scaler.fit_transform(data[numerical])

# TODO: One-hot encode the 'features_raw' data using pandas.get_dummies()

# Initialize features dataframe with the numerical features
features = features_raw[numerical]
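
# A sketch of completing the TODO above, mirroring Example #1 on this page:
# one-hot encode the categorical columns and join them to the scaled
# numerical features (column names assumed to match the census dataset).
categorical = features_raw.columns.difference(numerical)
features = pd.concat(
    [features, pd.get_dummies(features_raw[categorical])], axis=1)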
Example #4
# ## Preparing the Data
# Before data can be used as input for machine learning algorithms, it often must be cleaned, formatted, and restructured — this is typically known as **preprocessing**. Fortunately, for this dataset, there are no invalid or missing entries we must deal with; however, there are some qualities about certain features that must be adjusted. This preprocessing can help tremendously with the outcome and predictive power of nearly all learning algorithms.

# ### Transforming Skewed Continuous Features
# A dataset may sometimes contain at least one feature whose values tend to lie near a single number, but will also have a non-trivial number of vastly larger or smaller values than that single number. Algorithms can be sensitive to such distributions of values and can underperform if the range is not properly normalized. With the census dataset, two features fit this description: `'capital-gain'` and `'capital-loss'`.
#
# Run the code cell below to plot a histogram of these two features. Note the range of the values present and how they are distributed.

# In[5]:

# Split the data into features and target label
income_raw = data['income']
features_raw = data.drop('income', axis=1)

# Visualize skewed continuous features of original data
vs.distribution(data)

# For highly-skewed feature distributions such as `'capital-gain'` and `'capital-loss'`, it is common practice to apply a <a href="https://en.wikipedia.org/wiki/Data_transformation_(statistics)">logarithmic transformation</a> on the data so that the very large and very small values do not negatively affect the performance of a learning algorithm. Using a logarithmic transformation significantly reduces the range of values caused by outliers. Care must be taken when applying this transformation, however: the logarithm of `0` is undefined, so we must translate the values by a small amount above `0` to apply the logarithm successfully.
#
# Run the code cell below to perform a transformation on the data and visualize the results. Again, note the range of values and how they are distributed.

# In[6]:

# Log-transform the skewed features
skewed = ['capital-gain', 'capital-loss']
features_log_transformed = pd.DataFrame(data=features_raw)
features_log_transformed[skewed] = features_raw[skewed].apply(
    lambda x: np.log(x + 1))

# Visualize the new log distributions
vs.distribution(features_log_transformed, transformed=True)
Example #5
# ## Preparing the Data
# Before data can be used as input for machine learning algorithms, it often must be cleaned, formatted, and restructured — this is typically known as **preprocessing**. Fortunately, for this dataset, there are no invalid or missing entries we must deal with; however, there are some qualities about certain features that must be adjusted. This preprocessing can help tremendously with the outcome and predictive power of nearly all learning algorithms.

# ### Transforming Skewed Continuous Features
# A dataset may sometimes contain at least one feature whose values tend to lie near a single number, but will also have a non-trivial number of vastly larger or smaller values than that single number. Algorithms can be sensitive to such distributions of values and can underperform if the range is not properly normalized. With the census dataset, two features fit this description: `'capital-gain'` and `'capital-loss'`.
#
# Run the code cell below to plot a histogram of these two features. Note the range of the values present and how they are distributed.

# In[3]:

# Split the data into features and target label
income_raw = data['income']
features_raw = data.drop('income', axis=1)

# Visualize skewed continuous features of original data
vs.distribution(data)

# For highly-skewed feature distributions such as `'capital-gain'` and `'capital-loss'`, it is common practice to apply a <a href="https://en.wikipedia.org/wiki/Data_transformation_(statistics)">logarithmic transformation</a> on the data so that the very large and very small values do not negatively affect the performance of a learning algorithm. Using a logarithmic transformation significantly reduces the range of values caused by outliers. Care must be taken when applying this transformation, however: the logarithm of `0` is undefined, so we must translate the values by a small amount above `0` to apply the logarithm successfully.
#
# Run the code cell below to perform a transformation on the data and visualize the results. Again, note the range of values and how they are distributed.

# In[4]:

# Log-transform the skewed features
skewed = ['capital-gain', 'capital-loss']
features_raw[skewed] = data[skewed].apply(lambda x: np.log(x + 1))

# Visualize the new log distributions
vs.distribution(features_raw, transformed=True)

# ### Normalizing Numerical Features
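# The excerpt cuts off after this heading; a sketch of the normalization step
# that typically follows, assuming MinMaxScaler as in the other examples on
# this page:

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()  # default feature range is (0, 1)
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss',
             'hours-per-week']
features_raw[numerical] = scaler.fit_transform(features_raw[numerical])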
Example #6
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 23 15:41:00 2018

@author: apple
"""

# Import the libraries needed for this project
import numpy as np
import pandas as pd
from time import time
from IPython.display import display  # Allows the use of display() for DataFrames

# Import the supplementary visualization code visuals.py
import visuals as vs

# Import the census data
data = pd.read_csv("census.csv")

# Success - display the first record
display(data.head(n=1))

# Split the data into features and the corresponding labels
income_raw = data['income']
features_raw = data.drop('income', axis=1)

# Visualize the 'capital-gain' and 'capital-loss' features
vs.distribution(features_raw)
Example #7
# Print the results

print("Total number of wine data: {}".format(n_wines))
print("Wines with rating 7 and above: {}".format(n_above_6))
print("Wines with rating less than 5: {}".format(n_below_5))
print("Wines with rating 5 and 6: {}".format(n_between_5))
print("Percentage of wines with quality 7 and above: {:.2f}%".format(
    greater_percent))

# ### Run the following code cell to see the distributions on a graph:

# In[11]:

# TODO: Visualize skewed continuous features of original data
vs.distribution(data, "quality")

# ### Get useful statistics, such as mean, median and standard deviation of the features:

# In[12]:

# TODO: Get some additional statistics, such as mean, median, and standard deviation
display(np.round(data.describe()))

# As we can see, most wines fall under **average quality (between 5 and 6)**. High-rated wines number in the low hundreds, whereas there are very few wines that aren't tasty enough (low ratings).
#
# Next, since our aim is to predict the quality of wines, we’ll now extract the last column and store it separately.
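#
# A sketch of that extraction step (hypothetical variable names; assumes the
# target column is named 'quality', as in the plots above):

quality = data['quality']
features_raw = data.drop('quality', axis=1)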

# ## Section 2: Exploring Relationships between features
Example #8
n_records = df.shape[0]
n_greater_50k = sum(df['income'] == '>50K')
n_at_most_50k = sum(df['income'] == '<=50K')
greater_percent = n_greater_50k / n_records * 100

# Print the results
print("Total number of records: {}".format(n_records))
print("Individuals making more than $50,000: {}".format(n_greater_50k))
print("Individuals making at most $50,000: {}".format(n_at_most_50k))
print("Percentage of individuals making more than $50,000: {}%".format(
    greater_percent))

income_raw = df['income']
features_raw = df.drop('income', axis=1)

vs.distribution(df)

skewed = ['capital-gain', 'capital-loss']
features_log_transformed = pd.DataFrame(data=features_raw)
features_log_transformed[skewed] = features_raw[skewed].apply(
    lambda x: np.log(x + 1))

# Visualize the new log distributions
# vs.distribution(features_log_transformed, transformed=True)

scaler = MinMaxScaler()  # default=(0, 1)
numerical = [
    'age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'
]

features_log_minmax_transform = pd.DataFrame(data=features_log_transformed)
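# A sketch of the scaling step that typically follows, mirroring Example #1
# above (assumed; not part of the original excerpt):
features_log_minmax_transform[numerical] = scaler.fit_transform(
    features_log_transformed[numerical])
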
# Percentage of individuals whose income is more than $50,000
greater_percent = float(len(data[data['income'] == '>50K'])) / float(len(data)) * 100

# Print the results
print "Total number of records: {}".format(n_records)
print "Individuals making more than $50,000: {}".format(n_greater_50k)
print "Individuals making at most $50,000: {}".format(n_at_most_50k)
print "Percentage of individuals making more than $50,000: {:.2f}%".format(greater_percent)


# Split the data into features and target label
income_raw = data['income']
features_raw = data.drop('income', axis=1)

# Visualize skewed continuous features of original data
vs.distribution(data)

# Log-transform the skewed features
skewed = ['capital-gain', 'capital-loss']
features_raw[skewed] = data[skewed].apply(lambda x: np.log(x + 1))

# Visualize the new log distributions
vs.distribution(features_raw, transformed=True)


# Import sklearn.preprocessing.MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler()
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
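
# A sketch of applying the scaler, mirroring Example #1 above (assumed; not
# part of the original excerpt):
features_raw[numerical] = scaler.fit_transform(features_raw[numerical])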