예제 #1
0
def split_dfo(dfo,
              train_pct=.7,
              randomer=None,
              stratify=None,
              drop_cols=None,
              splain=local_settings.splain,
              **kwargs):
    '''
    split_dfo(dfo, train_pct=.7, randomer=None, stratify=None,
              drop_cols=None, splain=local_settings.splain, **kwargs)
    RETURNS: the same dfo object, with split settings and train/test attached

    Records randomer, stratify, train_pct and drop_cols on dfo. When stratify
    is None, dfo.y_column is recorded as the stratify value instead.

    A dataframe built from dfo.df is passed through remove_cols() with
    drop_cols, then handed to split_my_data_whole(). The two results are
    stored as dfo.train and dfo.test, and their indexes as dfo.train_index
    and dfo.test_index. Both frames are passed through frame_splain().

    NOTE(review): train_pct is stored on dfo but is not forwarded to
    split_my_data_whole() here -- confirm the helper's own default matches.
    **kwargs is accepted but not used.
    '''
    # keep the split configuration on the object for later reference
    dfo.randomer = randomer
    if stratify is None:
        dfo.stratify = dfo.y_column
    else:
        dfo.stratify = stratify
    dfo.train_pct = train_pct
    dfo.drop_cols = drop_cols

    working_df = remove_cols(df=pd.DataFrame(dfo.df), cols=drop_cols)
    split_parts = split_my_data_whole(
        df=working_df,
        target_column=dfo.y_column,
        stratify=dfo.stratify,
        random_state=dfo.randomer,
    )
    dfo.train, dfo.test = split_parts

    dfo.train_index = dfo.train.index
    frame_splain(dfo.train, 'DFO Train', splain=splain)

    dfo.test_index = dfo.test.index
    frame_splain(dfo.test, 'DFO Test', splain=splain)

    return dfo
예제 #2
0
def df_join_xy(X, y):
    '''
    df_join_xy(X, y)
    RETURNS: the result of X.join(y)

    Joins y onto X, passes the joined frame through frame_splain() under the
    title 'join df', and returns it. Lets a caller rebuild a combined frame
    from whichever X and y are at hand (train or test, scaled or unscaled).
    '''
    xy_joined = X.join(y)
    frame_splain(xy_joined, 'join df')
    return xy_joined
예제 #3
0
def rename_fields(dataframe):
    '''
    rename_fields(dataframe)
    RETURNS: renamed_df

    Renames the columns of dataframe according to the module-level
    _global_renames, using only the entries whose old name is actually a
    column of dataframe. The renamed frame is passed through frame_splain()
    under the title 'renamed df' and returned; rename() is called without
    inplace, so the result is a new frame.

    _global_renames may be either a mapping of old name -> new name or an
    iterable of (old, new) pairs.
    '''
    present = set(dataframe.columns.tolist())
    # dict() accepts both a mapping and an iterable of pairs. Iterating a
    # mapping directly would yield only its keys and break the two-name
    # unpacking below, so normalise first.
    rename_pairs = dict(_global_renames).items()
    renames = {old: new for old, new in rename_pairs if old in present}
    renamed_df = dataframe.rename(columns=renames)
    frame_splain(renamed_df, title='renamed df')
    return renamed_df
def check_df(dataframe, *args, splain=local_settings.splain, **kwargs):
    '''
    check_df(dataframe, *args, splain=local_settings.splain, **kwargs)
    RETURNS: dataframe

    Takes any dataframe, fills its null values with np.nan in place, and
    hands it to frame_splain() together with splain and any extra keyword
    arguments. Per the original notes, frame_splain() reports on the
    dataframe when splain is true.

    The object returned is the same dataframe that was passed in, so the
    caller's frame is modified. Positional *args are accepted but not used.
    '''
    # in-place fill: the caller's dataframe itself is changed
    dataframe.fillna(np.nan, inplace=True)
    frame_splain(dataframe, splain=splain, **kwargs)
    return dataframe
예제 #5
0
def xy_df(dataframe, y_column):
    '''
    xy_df(dataframe, y_column)
    RETURNS X_df, y_df

    Pass in one dataframe of observed data and the name of the target
    column. Returns a dataframe of all columns except the target column and
    a dataframe of just the target column. Each result is passed through
    frame_splain() (titled 'X' and 'y').

    If y_column is a list, more than one column can be separated: every
    listed column is removed from X_df and kept in y_df.
    '''
    # A list of names is used as-is; a single name is wrapped so drop()
    # always receives a flat list of labels. Wrapping a list a second time
    # makes drop() search for one label made of all the names and fail.
    target_labels = y_column if isinstance(y_column, list) else [y_column]
    X_df = dataframe.drop(target_labels, axis=1)
    frame_splain(X_df, title='X')
    # a single name selects a Series, a list selects a frame; either way the
    # result is wrapped as a DataFrame
    y_df = pd.DataFrame(dataframe[y_column])
    frame_splain(y_df, title='y')
    return X_df, y_df
def wrangle_zillow(db='zillow', sql='zillow_sql', sql_string=False):
    '''
    wrangle_zillow(db='zillow', sql='zillow_sql', sql_string=False)
    RETURNS result_df

    Pass database name ('zillow' by default) and either the name of a preset
    sql statement with sql_string=False (default) or a literal sql statement
    with sql_string=True.

    With sql_string=False, sql is forwarded to get_sql() to look up the
    statement. With sql_string=True, sql is sent to the database as given.

    Produces results of the SQL statement in a dataframe object, which is
    also passed through frame_splain(topx=5, maxcols=10).

    *** Requires user, password, and host from env.py ***
    '''
    zillow_url = get_db_url(user=user, password=password, host=host, database=db)
    # Forward the requested preset name instead of a fixed 'zillow_sql', so
    # the sql argument is honoured when sql_string is False.
    use_sql = sql if sql_string else get_sql(sql=sql)
    result_df = pd.read_sql(use_sql, zillow_url)
    frame_splain(result_df, topx=5, maxcols=10)
    return result_df
예제 #7
0
def edit_prep_df(dataframe):
    '''
    edit_prep_df(dataframe)
    RETURNS: prepped_df

    Narrows dataframe to the four columns used for the MVP objective:
    'nbr_bthrms', 'nbr_bedrms', 'finished_sqft' and 'taxable_value'. Per the
    original notes, the first three are the features (bathrooms, bedrooms,
    square footage) and 'taxable_value' is the target variable.

    The selection is passed through frame_splain() under the title
    'prepped df' before being returned.
    '''
    mvp_df = dataframe[[
        'nbr_bthrms', 'nbr_bedrms', 'finished_sqft', 'taxable_value',
    ]]
    frame_splain(mvp_df, title='prepped df')
    return mvp_df


# print('Got Prep')
예제 #8
0
def scale_dfo(dfo, scaler_fn=standard_scaler, splain=local_settings.splain, **kwargs):
    '''
    scale_dfo(dfo, scaler_fn=standard_scaler, splain=local_settings.splain, **kwargs)
    RETURNS: the same dfo object, with scaler and X/y frames attached

    scaler_fn must be a function (or None). It is called with train=dfo.train
    and test=dfo.test, and its result is unpacked into dfo.scaler,
    dfo.train_scaled and dfo.test_scaled. With scaler_fn=None, dfo.scaler is
    set to None and no *_scaled attributes are assigned.

    A 'dummy_val' column equal to 1 is set on dfo.train and dfo.test (and on
    the scaled frames when present) to allow for later feature selection
    testing. This modifies those frames in place.

    xy_df() then splits each frame on dfo.y_column into dfo.X_train /
    dfo.y_train, dfo.X_test / dfo.y_test, and the matching *_scaled
    attributes. Every resulting frame is passed through frame_splain().

    **kwargs is accepted but not used.
    '''
    dfo.scaler_fn = scaler_fn
    use_scaler = scaler_fn is not None

    if use_scaler:
        scaled_result = scaler_fn(train=dfo.train, test=dfo.test)
        dfo.scaler, dfo.train_scaled, dfo.test_scaled = scaled_result
        for scaled_frame in (dfo.train_scaled, dfo.test_scaled):
            scaled_frame['dummy_val'] = 1
    else:
        dfo.scaler = None

    # the unscaled frames get the dummy column only after the scaler has run
    for raw_frame in (dfo.train, dfo.test):
        raw_frame['dummy_val'] = 1

    dfo.X_train, dfo.y_train = xy_df(dataframe=dfo.train, y_column=dfo.y_column)
    dfo.X_test, dfo.y_test = xy_df(dataframe=dfo.test, y_column=dfo.y_column)
    unscaled_reports = (
        (dfo.X_train, 'X_Train'),
        (dfo.y_train, 'y_Train'),
        (dfo.X_test, 'X_Test'),
        (dfo.y_test, 'Y_Test'),
    )
    for frame, title in unscaled_reports:
        frame_splain(frame, title, splain=splain)

    if not use_scaler:
        return dfo

    dfo.X_train_scaled, dfo.y_train_scaled = xy_df(dataframe=dfo.train_scaled, y_column=dfo.y_column)
    dfo.X_test_scaled, dfo.y_test_scaled = xy_df(dataframe=dfo.test_scaled, y_column=dfo.y_column)
    scaled_reports = (
        (dfo.X_train_scaled, 'X_Train_scaled'),
        (dfo.y_train_scaled, 'y_Train_scaled'),
        (dfo.X_test_scaled, 'X_Test_scaled'),
        (dfo.y_test_scaled, 'Y_Test_scaled'),
    )
    for frame, title in scaled_reports:
        frame_splain(frame, title, splain=splain)

    return dfo