def predict(selected_columns):
    """Load the persisted model and predict labels for the rows in `csvdata`.

    Applies the same preprocessing pipeline as training (column pruning,
    education ordinal-encoding, mean imputation, one-hot encoding, standard
    scaling), writes an annotated copy of the input CSV to "output.csv" with
    the label column replaced by the predictions, and returns the rows.

    :param selected_columns: iterable of column names to keep as features;
        every other column in COLUMNS is dropped.
    :returns: tuple of (model_name, list-of-dict rows of output.csv).
    """
    # Restore the model previously saved by train() for this model_file_name.
    fit_model = joblib.load(model_file_name)
    predict_set = pd.read_csv(csvdata, names=COLUMNS,
                              skipinitialspace=True, skiprows=1)
    # The label column is present in the input file but unused for prediction.
    del predict_set[LABEL]
    # Work on a copy so the raw frame stays untouched.
    predict_set_imputed = deepcopy(predict_set)
    # Drop every column the caller did not select, plus three columns the
    # pipeline always excludes.
    remove_list = list(set(COLUMNS) - set(selected_columns))
    predict_set_imputed = predict_set_imputed.drop(
        remove_list + ['marital', 'job', 'contact'], axis=1)
    # Map education onto an ordinal scale; 'unknown' becomes NaN and is then
    # mean-imputed together with any other missing values.
    predict_set_imputed['education'] = ternary_vectorizing(
        predict_set_imputed['education'], ['primary', 'secondary', 'tertiary'])
    predict_set_imputed['education'].replace('unknown', np.nan, inplace=True)
    predict_set_imputed.fillna(predict_set_imputed.mean(), inplace=True)
    # One-hot encode only the categorical columns the caller selected.
    predict_set_imputed = one_hot(
        predict_set_imputed,
        list(set(CATEGORICAL_COLUMNS_2).intersection(set(selected_columns))))
    # NOTE(review): fit_transform re-fits the shared scaler on the prediction
    # data instead of reusing the statistics fitted at training time
    # (standard_scaler.transform). This leaks prediction-set statistics and
    # may scale features differently than the model saw in training — confirm
    # whether this is intentional.
    predict_set_sc_scaled_imputed = standard_scaler.fit_transform(
        predict_set_imputed)
    predict_set_sc_scaled_imputed = pd.DataFrame(predict_set_sc_scaled_imputed)
    print 'predict_set_sc_scaled_imputed', list(
        predict_set_sc_scaled_imputed.columns.values)
    label_pred = fit_model.predict(predict_set_sc_scaled_imputed)
    i = 0
    # Prepend a placeholder so label_pred[i] lines up with 1-based data rows
    # (row 0 of the CSV is the header).
    label_pred = ['y'] + label_pred.tolist()
    result = []
    # NOTE(review): rows are re-read from file_name while features came from
    # csvdata — verify both names point at the same file, otherwise rows and
    # predictions can be misaligned.
    with open(file_name, 'r') as csvinput:
        for row in csv.reader(csvinput):
            if i == 0:
                # Replace the original header with the canonical column list.
                result.append(COLUMNS)
            else:
                # NOTE(review): this assumes the model emits 0/1 labels; the
                # training code appears to fit on raw label values (the
                # binary_vectorizing call is commented out), in which case
                # `== 0` would never match — confirm the label encoding.
                row[-1] = 'no' if label_pred[i] == 0 else 'yes'
                result.append(row)
            i += 1
    with open("output.csv", "w") as f:
        writer = csv.writer(f)
        writer.writerows(result)
    # Read the file back so the caller gets dict rows keyed by header names.
    out = None
    with open('output.csv', 'r') as f:
        reader = csv.DictReader(f)
        out = [row for row in reader]
    return model_name, out
def validate(selected_columns): fit_model = joblib.load(model_file_name) validation_set = pd.read_csv(csvdata, names=COLUMNS, skipinitialspace=True, skiprows=1) validation_label_set = deepcopy(validation_set[LABEL]) del validation_set[LABEL] # validation_set_imputed = deepcopy(validation_set) remove_list = list(set(COLUMNS) - set(selected_columns)) validation_set_imputed = validation_set_imputed.drop( remove_list + ['marital', 'job', 'contact'], axis=1) validation_set_imputed['education'] = ternary_vectorizing( validation_set_imputed['education'], ['primary', 'secondary', 'tertiary']) validation_set_imputed['education'].replace('unknown', np.nan, inplace=True) validation_set_imputed.fillna(validation_set_imputed.mean(), inplace=True) validation_set_imputed = one_hot(validation_set_imputed, list(set(CATEGORICAL_COLUMNS_2).intersection( set(selected_columns)))) # validation_label_set = binary_vectorizing(validation_label_set, ['no', 'yes']) # validation_set_sc_scaled_imputed = standard_scaler.fit_transform( validation_set_imputed) validation_set_sc_scaled_imputed = pd.DataFrame( validation_set_sc_scaled_imputed) print 'validation_set_sc_scaled_imputed', list( validation_set_sc_scaled_imputed.columns.values) label_pred = fit_model.predict(validation_set_sc_scaled_imputed) cnf_matrix = confusion_matrix(validation_label_set, label_pred) np.set_printoptions(precision=2) print model_name, cnf_matrix tp, fp, fn, tn = confusion_matrix(validation_label_set, label_pred).ravel() validate_result = {'TrueNegative': tn, 'FalsePositive': fp, 'FalseNegative': fn, 'TruePositive': tp} return model_name, validate_result
standard_scaler = preprocessing.StandardScaler() ############################ training_set = pd.read_csv(BANK_TRAINING, names=COLUMNS, skipinitialspace=True, skiprows=1) training_label_set = deepcopy(training_set[LABEL]) del training_set[LABEL] # training_set_imputed = deepcopy(training_set) training_set_imputed = training_set_imputed.drop(['marital', 'job', 'contact'], axis=1) training_set_imputed['education'] = ternary_vectorizing( training_set_imputed['education'], ['primary', 'secondary', 'tertiary']) training_set_imputed['education'].replace('unknown', np.nan, inplace=True) training_set_imputed.fillna(training_set_imputed.mean(), inplace=True) training_set_imputed = one_hot(training_set_imputed, CATEGORICAL_COLUMNS_2) # training_label_set = binary_vectorizing(training_label_set, ['no', 'yes']) # training_set_sc_scaled_imputed = standard_scaler.fit_transform( training_set_imputed) training_set_sc_scaled_imputed = pd.DataFrame(training_set_sc_scaled_imputed) # ########################## validation_set = pd.read_csv(BANK_TESTING, names=COLUMNS, skipinitialspace=True,
def train(selected_columns): training_set = pd.read_csv(csvdata, names= COLUMNS, skipinitialspace=True, skiprows=1) training_label_set = deepcopy(training_set[LABEL]) del training_set[LABEL] training_set_imputed = deepcopy(training_set) remove_list = list(set(COLUMNS) - set(selected_columns)) training_set_imputed = training_set_imputed.drop(remove_list + ['marital', 'job', 'contact'], axis=1) training_set_imputed['education'] = ternary_vectorizing( training_set_imputed['education'], ['primary', 'secondary', 'tertiary']) training_set_imputed['education'].replace('unknown', np.nan, inplace=True) training_set_imputed.fillna(training_set_imputed.mean(), inplace=True) training_set_imputed = one_hot(training_set_imputed, list(set(CATEGORICAL_COLUMNS_2).intersection(set(selected_columns)))) # training_label_set = binary_vectorizing(training_label_set, ['no', 'yes']) # training_set_sc_scaled_imputed = standard_scaler.fit_transform( training_set_imputed) training_set_sc_scaled_imputed = pd.DataFrame( training_set_sc_scaled_imputed) fit_model = MODELS[model_name].fit(training_set_sc_scaled_imputed, training_label_set) # print fit_model.feature_importances_ importances = [] feature_importance_output = "" if (model_name == "rf"): importances = fit_model.feature_importances_ feature_importance_output = username + "_rf.json" elif (model_name == "lm"): importances = fit_model.coef_[0] feature_importance_output = username + "_lm.json" if (len(importances)): feature_names = training_set_imputed.keys() indices = np.argsort(importances)[::-1] df = pd.DataFrame(columns=['features', 'importance']) for f in range(training_set_sc_scaled_imputed.shape[1]): df.loc[f] = [feature_names[indices[f]], importances[indices[f]]] f_dict = {} for index, row in df.iterrows(): key = row['features'].split('_', 1)[0] print key, row['importance'] if key in f_dict: f_dict[key] += row['importance'] else: f_dict[key] = row['importance'] print f_dict if feature_importance_output: with 
open(feature_importance_output, 'w') as f2: json.dump(f_dict, f2) joblib.dump(fit_model, model_file_name) print model_name, model_file_name if not os.path.exists('existingmodel.json'): open('existingmodel.json', 'a').close() with open('existingmodel.json', 'r') as f: jd = {} try: jd = json.load(f) except: pass with open('existingmodel.json', 'w') as f1: jd[model_file_name] = selected_columns json.dump(jd, f1) if (not feature_importance_output): return model_name, fit_model.get_params() else: return model_name, fit_model.get_params(), feature_importance_output