def demo_rfc(DATA_DIR, FEATURES, RFC):
    """Interactively predict a rank with a loaded random forest model.

    Reads the ISO3 category encoding from ``meta.json`` in ``DATA_DIR``,
    prompts the user for a year, a floor number, a wall number and a country
    ISO3 code, runs the answers through ``prep.ranking`` and prints the rank
    predicted by ``RFC``.

    Args:
        DATA_DIR: path of the directory that contains ``meta.json``.
        FEATURES: sequence of feature names; its first four entries are used
            as the column labels of the single-row input frame.
        RFC: loaded rfc model; only its ``predict`` method is used.

    Raises:
        ValueError: if the year, floor or wall answer is not an integer.
        KeyError: if the typed ISO3 code is not in the encoding table.
    """
    import json
    import os

    import pandas as pd

    # meta.json holds the category encoding for iso3 at index 3.
    # os.path.join is used instead of string concatenation so DATA_DIR may be
    # any path-like value; JSON is read explicitly as UTF-8.
    with open(os.path.join(DATA_DIR, 'meta.json'), encoding='utf-8') as f:
        meta_data = json.load(f)
    iso_dict = meta_data[3]

    # Prompt the user to enter data.
    yrs = int(input('Type house initial year: '))
    hf = int(input('Type the floor number: '))
    hw = int(input('Type the wall number: '))
    # Surrounding whitespace is an easy typing slip; drop it before lookup.
    iso = input('Type the country iso3 code: ').strip()

    if iso not in iso_dict:
        # Same exception type as a plain failed lookup, but with a message
        # that tells the user what went wrong.
        raise KeyError(
            'Unknown country iso3 code {!r}; it is not present in the iso3 '
            'encoding stored in meta.json'.format(iso))

    # NOTE(review): the values are matched to FEATURES[:4] by position, so
    # FEATURES is assumed to start with year, floor, wall, iso3 -- confirm
    # against the caller.
    user_input = [yrs, hf, hw, iso_dict[iso]]

    # Prediction based on user input.
    # NOTE(review): `prep` is not imported in this function; it must be
    # available at module scope.
    input_ = pd.DataFrame(data=[user_input], columns=FEATURES[:4])
    input_ = prep.ranking(input_, ['wall', 'floor'])
    ans = RFC.predict(input_)
    print('Predicted as rank {}.'.format(ans[0]))
# The project packages live one directory up; extend the import path before
# importing them.
sys.path.append('../hp_classify')
import prep.prep_data as prep
import model.rfc_build as rf

# Globals
FILEPATH = '../data/housing_data.csv'
LABEL = 'roof'
ATTR = [
    'int_year', 'housing_roof_num', 'housing_wall_num', 'housing_floor_num',
    'iso3'
]
VAR = ['roof', 'floor', 'wall']

# Test setup for rfc model data preprocessing functions: load the data, rank
# it, extract the features, redistribute and split it, then fit the model
# that the tests below inspect.
df = prep.load_data(FILEPATH)
df = prep.ranking(df, VAR)
FEATURES = prep.extract_features(df, LABEL)
df, RANK_NUM = prep.shuffle_redistribute(df, LABEL)
x_train, x_test, y_train, y_test = prep.train_test_split(df, FEATURES, LABEL)
RFC = rf.rfc_model(x_train, y_train, LABEL)
pred_test = RFC.predict(x_test)
c_matrix = rf.confusion_matrix(pred_test, y_test)
loaded_model = rf.load_model('finalized_roof_model.sav')


def test_rfc_build():
    """Test that the rfc is correctly generated.

    Checks that the object returned by ``rf.rfc_model`` is a scikit-learn
    random forest classifier.
    """
    # Use the public class path: the private module ``sklearn.ensemble.forest``
    # was deprecated and later removed from scikit-learn. isinstance is the
    # idiomatic type check and also accepts subclasses.
    assert isinstance(RFC, sklearn.ensemble.RandomForestClassifier)
STR_GARBAGE = ['nan', 'other', 'not a dejure resident', 'not dejure resident'] RANK_GARBAGE = ['4', '5', '6', '7', '8', '9', 'n'] # read in the df using our function in order to pass to later tests # read in df using your function and then using pandas regular csv read, then compare the resulting dfs df = prep.read_then_clean(FILEPATH, CLEAN_COLS) raw_csv = pd.read_csv(FILEPATH) # also passed it through the rest of the cleaning pipeline on order to compare df to df_clean df_clean = prep.remove_garbage_codes(df, STR_VARS, STR_GARBAGE) df_clean = prep.extract_ranking(df_clean, NUM_VARS) df_clean = prep.remove_garbage_codes(df_clean, RANK_VARS, RANK_GARBAGE) # test setup for rfc model data preprocessing functions df_rfc = prep.load_data(FILEPATH) df_rank_check = prep.ranking(df_rfc, VAR) FEATURES = prep.extract_features(df_rank_check, LABEL) df_shuffle_check, RANK_NUM = prep.shuffle_redistribute(df_rank_check, LABEL) x_train, x_test, y_train, y_test = prep.train_test_split( df_shuffle_check, FEATURES, LABEL) def test_globals(): """This function tests that the test globals are properly defined. """ # assert that digits are removed assert re_dig.search(DIGITS) != None, "global doesn't contain digits!" # assert that punctutation is removed assert re_punct.search( PUNCT) != None, "global doesn't contain punctuation!" # assert that excessive whitespace is removed