Example #1
from pandas import read_csv
from mlxtend.utils.data import filename2path

def boston_housing_data():
    """Boston Housing dataset.

    Source : https://archive.ics.uci.edu/ml/datasets/Housing
    Number of samples : 506

    Continuous target variable : MEDV
    MEDV = Median value of owner-occupied homes in $1000's

    Dataset Attributes:

        - 1) CRIM      per capita crime rate by town
        - 2) ZN        proportion of residential land zoned for lots over
                 25,000 sq.ft.
        - 3) INDUS     proportion of non-retail business acres per town
        - 4) CHAS      Charles River dummy variable (= 1 if tract bounds
                 river; 0 otherwise)
        - 5) NOX       nitric oxides concentration (parts per 10 million)
        - 6) RM        average number of rooms per dwelling
        - 7) AGE       proportion of owner-occupied units built prior to 1940
        - 8) DIS       weighted distances to five Boston employment centres
        - 9) RAD       index of accessibility to radial highways
        - 10) TAX      full-value property-tax rate per $10,000
        - 11) PTRATIO  pupil-teacher ratio by town
        - 12) B        1000(Bk - 0.63)^2 where Bk is the proportion of
                 Black residents by town
        - 13) LSTAT    % lower status of the population

    Returns
    -------
    X, y : [n_samples, n_features], [n_samples]
        X is the feature matrix with 506 housing samples as rows
        and 13 feature columns.
        y is a 1-dimensional array of the continuous target variable MEDV

    Examples
    --------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/data/boston_housing_data/

    """
    data = read_csv(filename2path("boston-housing.csv"), index_col=0)
    return df2Xy(data, 'medv')
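The helper df2Xy is used throughout these examples but never shown. Below is a minimal sketch of what it presumably does, assuming it splits a DataFrame into a NumPy feature matrix and a target vector; the last-column fallback is a guess based on Example #4 calling it without a target name.

def df2Xy(df, target=None):
    # Hypothetical reconstruction of the helper used in these examples.
    # If no target column is named, assume the last column holds the target.
    if target is None:
        target = df.columns[-1]
    X = df.drop(columns=[target]).values
    y = df[target].values
    return X, y

With that in place, a quick usage check:

X, y = boston_housing_data()
print(X.shape, y.shape)  # expected: (506, 13) (506,)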
Example #2
from pandas import read_csv
from mlxtend.utils.data import filename2path

def kaggle_gmsc_data_nomissing():
    """Kaggle 'Give Me Some Credit' dataset with rows containing missing values dropped."""
    data = read_csv(filename2path("give-me-some-credit"), index_col=0)
    data.dropna(inplace=True)
    return df2Xy(data, 'SeriousDlqin2yrs')
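A quick usage sketch (NumPy is used here purely for illustration; the printed counts depend on how many rows survive dropna):

import numpy as np

X, y = kaggle_gmsc_data_nomissing()
print(X.shape)                     # rows remaining after dropping missing values
print(np.bincount(y.astype(int)))  # SeriousDlqin2yrs is a binary delinquency flag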
Example #3
# ```bash
# du -h train_full_raw.csv
# wc -l train_full_raw.csv
# ```

# + [markdown] slideshow={"slide_type": "slide"}
# We'll go through this data in "chunks". Let's set the chunk size (the cell below is tagged so papermill can override this value):

# + tags=["parameters"]
CHUNK_SIZE = 1000  # small value for fast execution; can bump to 10000
# -
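# For example, papermill can inject a different value from the command line
# (the notebook filenames below are hypothetical):
#
# ```bash
# papermill chunks.ipynb chunks-out.ipynb -p CHUNK_SIZE 10000
# ```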

# Let's create a chunk "reader":

from mlxtend.utils.data import filename2path
from pandas import read_csv
TRAIN_FILE = filename2path("avazu")
HEADER = ['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
          'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
          'device_ip', 'device_model', 'device_type', 'device_conn_type',
          'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
reader = read_csv(TRAIN_FILE,
                  chunksize=CHUNK_SIZE,
                  names=HEADER,
                  header=0,
                  index_col=0)

reader

# + [markdown] slideshow={"slide_type": "slide"}
# At a chunk size of 10,000, there are about 4,000 chunks in this data (roughly 40 million rows in total).
#
# Let's look at the first chunk:
# -
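# A sketch of the cell that presumably follows (the notebook is truncated here):

first_chunk = next(reader)  # a DataFrame with CHUNK_SIZE rows
first_chunk.head()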
Example #4
from pandas import read_csv
from mlxtend.utils.data import filename2path

def kaggle_house_prices():
    data = read_csv(filename2path("house-prices"), index_col=0)
    data.dropna(inplace=True)
    return df2Xy(data)  # no target named: df2Xy presumably falls back to the last column ('SalePrice')
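Usage sketch (the exact shapes depend on how many rows dropna leaves):

X, y = kaggle_house_prices()
print(X.shape, y.shape)  # y holds the continuous SalePrice target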
Example #5
from pandas import read_csv
from mlxtend.utils.data import filename2path

def hotel_reviews_data():
    data = read_csv(filename2path("hotel-reviews.csv"))
    X = data['text'].values   # raw review strings
    y = data['label'].values  # label per review
    return X, y
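A sketch of how the arrays might be consumed downstream; scikit-learn is an illustration here, not part of the source:

from sklearn.feature_extraction.text import TfidfVectorizer

X_text, y = hotel_reviews_data()
vectorizer = TfidfVectorizer(min_df=2)  # ignore terms that appear in fewer than 2 reviews
X = vectorizer.fit_transform(X_text)    # sparse TF-IDF document-term matrix
print(X.shape, y.shape)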