# Example #1
# 0
def demo_rfc(DATA_DIR,FEATURES,RFC):
    """
    Interactive demo: prompt for one house record and print the rfc prediction.

    Reads ``meta.json`` from DATA_DIR, asks for four values on stdin, builds a
    one-row DataFrame from them, passes it through ``prep.ranking`` and prints
    the first prediction returned by the model.

    Args:
        DATA_DIR: data directory path (string) that contains ``meta.json``
        FEATURES: feature names of the model; the first four are used as the
            column names of the one-row input frame
        RFC: loaded rfc model; only its ``predict`` method is used

    Raises:
        ValueError: if one of the three numeric prompts is not an integer
        KeyError: if the typed iso3 code is not present in the meta data
    """
    import json
    import pandas as pd

    # The element at index 3 of the meta data is the iso3 category encoding.
    meta_path = DATA_DIR + "/" + 'meta.json'
    with open(meta_path) as meta_file:
        iso_lookup = json.load(meta_file)[3]

    # Collect the record from the user, in the same order as the columns below.
    year = int(input('Type house initial year: '))
    floor_num = int(input('Type the floor number: '))
    wall_num = int(input('Type the wall number: '))
    iso_code = input('Type the country iso3 code: ')
    record = [year, floor_num, wall_num, iso_lookup[iso_code]]

    # Build the single-row frame, rank it and predict.
    frame = pd.DataFrame(data=[record], columns=FEATURES[:4])
    frame = prep.ranking(frame, ['wall', 'floor'])
    prediction = RFC.predict(frame)
    print(f'Predicted as rank {prediction[0]}.')
sys.path.append('../hp_classify')
import prep.prep_data as prep
import model.rfc_build as rf

# Globals shared by the test setup below.
# Relative path of the housing CSV handed to prep.load_data().
FILEPATH = '../data/housing_data.csv'
# Label name passed to the prep/rf helpers below (presumably the target column).
LABEL = 'roof'
# NOTE(review): ATTR is not referenced again in the visible part of this file.
ATTR = [
    'int_year', 'housing_roof_num', 'housing_wall_num', 'housing_floor_num',
    'iso3'
]
# Names handed to prep.ranking() below.
VAR = ['roof', 'floor', 'wall']

# Test setup for the rfc model data preprocessing functions. Everything here
# runs at import time, and each step consumes the result of the previous one,
# so the statement order matters.
df = prep.load_data(FILEPATH)
df = prep.ranking(df, VAR)
FEATURES = prep.extract_features(df, LABEL)
# shuffle_redistribute returns a pair: the new frame and a second value kept
# as RANK_NUM (its meaning is defined in prep -- TODO confirm).
df, RANK_NUM = prep.shuffle_redistribute(df, LABEL)
x_train, x_test, y_train, y_test = prep.train_test_split(df, FEATURES, LABEL)
# Build the model from the training split; the result must expose predict().
RFC = rf.rfc_model(x_train, y_train, LABEL)
pred_test = RFC.predict(x_test)
# Argument order is (predictions, true labels).
c_matrix = rf.confusion_matrix(pred_test, y_test)
# Load a model from a file path relative to the current working directory.
loaded_model = rf.load_model('finalized_roof_model.sav')


def test_rfc_build():
    """Test that the module-level RFC is a scikit-learn random forest classifier.

    Raises:
        AssertionError: if RFC is not a RandomForestClassifier instance.
    """
    # Use the public import path: the private ``sklearn.ensemble.forest``
    # module was deprecated in scikit-learn 0.22 and removed in 0.24, so
    # referencing it raises AttributeError on current versions.
    from sklearn.ensemble import RandomForestClassifier

    # isinstance is the idiomatic type check and also accepts subclasses.
    assert isinstance(RFC, RandomForestClassifier)
# Example #3
# 0
# Values passed to prep.remove_garbage_codes() for the STR_VARS columns.
STR_GARBAGE = ['nan', 'other', 'not a dejure resident', 'not dejure resident']
# Values passed to prep.remove_garbage_codes() for the RANK_VARS columns.
RANK_GARBAGE = ['4', '5', '6', '7', '8', '9', 'n']

# Read the data twice: once through our own reader (reused by later tests) and
# once with plain pandas, so that tests can compare the two resulting frames.
df = prep.read_then_clean(FILEPATH, CLEAN_COLS)
raw_csv = pd.read_csv(FILEPATH)

# Also pass df through the rest of the cleaning pipeline in order to compare
# df to df_clean. Each step feeds the next, so the order matters.
df_clean = prep.remove_garbage_codes(df, STR_VARS, STR_GARBAGE)
df_clean = prep.extract_ranking(df_clean, NUM_VARS)
df_clean = prep.remove_garbage_codes(df_clean, RANK_VARS, RANK_GARBAGE)

# Test setup for the rfc model data preprocessing functions. Each intermediate
# result keeps its own name so individual stages can be checked by the tests.
df_rfc = prep.load_data(FILEPATH)
df_rank_check = prep.ranking(df_rfc, VAR)
FEATURES = prep.extract_features(df_rank_check, LABEL)
# shuffle_redistribute returns a pair: the new frame and a second value kept
# as RANK_NUM (its meaning is defined in prep -- TODO confirm).
df_shuffle_check, RANK_NUM = prep.shuffle_redistribute(df_rank_check, LABEL)
x_train, x_test, y_train, y_test = prep.train_test_split(
    df_shuffle_check, FEATURES, LABEL)


def test_globals():
    """This function tests that the test globals are properly defined.
    """
    # assert that digits are removed
    assert re_dig.search(DIGITS) != None, "global doesn't contain digits!"
    # assert that punctutation is removed
    assert re_punct.search(
        PUNCT) != None, "global doesn't contain punctuation!"
    # assert that excessive whitespace is removed