def feature_pipeline(primary_df, fit_df):
    """Engineer features for complete dataset

    Parameters
    ----------
    primary_df : DataFrame
        Dataset to engineer; always used to transform StandardScaler()
    fit_df : DataFrame
        Dataset used to fit StandardScaler()

    Returns
    -------
    full_feature_df : DataFrame
        Complete dataset with re-engineered features
    """
    # Reclassify the 'Round' feature accordingly; a missing 'Round'
    # column raises KeyError, which we deliberately treat as "nothing
    # to reclassify" and skip.
    try:
        bidirectional_rounds_str_numeric(primary_df)
    except KeyError:
        pass

    # Convert team points/game features into point differential features
    team_points_differentials(primary_df)

    # Convert favorite-underdog features to a single class of underdog
    # relative feature
    matchups_to_underdog_relative(primary_df)

    # 'Center the data' for all numerical features; improves models'
    # signal processing abilities. fit_df fits the scaler, primary_df
    # is transformed.
    full_feature_df = scale_features(primary_df, fit_df)

    return full_feature_df
def load_data(datadir, tseg=1024.0, log_features=None, ranking=None):
    """Load raw features/labels, derive the labelled subset, scale, and
    assemble the stacked (train+val+test) arrays.

    Returns the full 15-tuple of raw, labelled, and scaled artifacts.
    """
    # Raw features and bookkeeping arrays for every segment.
    loaded = feature_engineering.load_features(
        datadir, tseg, log_features=log_features, ranking=ranking)
    features, labels, lc, hr, tstart, nseg = loaded

    # Restrict to the labelled subset of the data.
    labelled = feature_engineering.labelled_data(
        features, labels, lc, hr, tstart, nseg)
    features_lb, labels_lb, lc_lb, hr_lb, tstart_lb, nseg_lb = labelled

    # Scale both the full and the labelled feature sets together.
    fscaled, fscaled_lb = feature_engineering.scale_features(
        features, features_lb)

    # Stack the train/val/test splits into single arrays.
    splits = ("train", "val", "test")
    fscaled_full = np.vstack([fscaled[s] for s in splits])
    labels_all = np.hstack([labels[s] for s in splits])

    # NOTE: tstart_lb is intentionally not part of the return tuple,
    # matching the original interface.
    return (features, labels, lc, hr, tstart, nseg,
            features_lb, labels_lb, lc_lb, hr_lb, nseg_lb,
            fscaled, fscaled_lb, fscaled_full, labels_all)
#create goal data in single currency X_train = currency_conversion(X_train) # drop unnecessary columns X_train = drop_columns(X_train) print('get_dummies') cat_columns = ['country', 'category_main', 'category_sub'] X_train = make_dummies(X_train, cat_columns) # address skew by applying logarithm num_columns = ['project_duration_days', 'blurb_length', 'usd_goal'] X_train = fix_skew(X_train, skewed=num_columns) # scale numerical features X_train = scale_features(X_train, num_columns) # resample X_train, y_train = rebalance(X_train, y_train) print("Feature engineering on train data complete") # initiate model print("Training a decision tree classifier") clf_tree = DecisionTreeClassifier(criterion="gini", max_depth=3, min_samples_leaf=5) # Performing training clf_tree.fit(X_train, y_train) # Create predictions using simple model - decision tree y_train_pred = clf_tree.predict(X_train)
X_test = currency_conversion(X_test) # drop unnecessary columns X_test = drop_columns(X_test) # This is FUTURE WORK # split categorical columns into dummies cat_columns=['country', 'category_main','category_sub'] X_test = make_dummies(X_test, cat_columns) # address skew by applying logarithm num_columns = ['project_duration_days', 'blurb_length', 'usd_goal'] X_test = fix_skew(X_test, skewed=num_columns) # scale numerical features X_test = scale_features(X_test, num_columns) print("Feature engineering on test data complete") # calculate predictions y_test_pred = loaded_model.predict(X_test) # print results print("Confusion Matrix: \n", confusion_matrix(y_test, y_test_pred)) print ("Accuracy : \n", accuracy_score(y_test, y_test_pred)*100) print("Report : \n", classification_report(y_test, y_test_pred))
] ### Load the dictionary containing the dataset with open("final_project_dataset.pkl", "r") as data_file: data_dict = pickle.load(data_file) ### Task 2: Remove outliers data_dict.pop('TOTAL') data_dict.pop('THE TRAVEL AGENCY IN THE PARK') data_dict.pop('LOCKHART EUGENE E') ### Task 3: Create new feature(s) new_data_dict = create_features(data_dict) #### Scale features for use in PCA new_data_dict = scale_features(new_data_dict) ### Store to my_dataset for easy export below. my_dataset = new_data_dict ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) ### Task 4: Try a varity of classifiers ### Please name your classifier clf for easy export below. ### Note that if you want to do PCA or other multi-stage operations, ### you'll need to use Pipelines. For more info: ### http://scikit-learn.org/stable/modules/pipeline.html from sklearn.pipeline import Pipeline