def feature_pipeline(primary_df, fit_df):
    """Engineer features for a complete dataset.

    Parameters
    ----------
    primary_df : DataFrame
        Dataset to engineer; always the one transformed by StandardScaler().
    fit_df : DataFrame
        Dataset used to fit StandardScaler().

    Returns
    -------
    full_feature_df : DataFrame
        Complete dataset with re-engineered features.
    """
    # Reclassify the 'Round' feature — best-effort: the column may be absent,
    # in which case the helper raises KeyError and the step is skipped.
    try:
        bidirectional_rounds_str_numeric(primary_df)
    except KeyError:
        pass

    # Convert team points/game features into point-differential features.
    team_points_differentials(primary_df)

    # Collapse favorite-underdog feature pairs into a single class of
    # underdog-relative features.
    matchups_to_underdog_relative(primary_df)

    # 'Center the data' for all numerical features (scaler fit on fit_df,
    # applied to primary_df); improves the models' signal processing.
    full_feature_df = scale_features(primary_df, fit_df)

    return full_feature_df
def load_data(datadir, tseg=1024.0, log_features=None, ranking=None):
    """Load features from *datadir*, extract the labelled subset, and scale.

    Returns the raw and labelled feature sets, their metadata, the scaled
    features, plus the scaled features and labels stacked across the
    train/val/test splits.
    """
    # Full (labelled + unlabelled) feature set and its metadata.
    loaded = feature_engineering.load_features(
        datadir, tseg, log_features=log_features, ranking=ranking)
    features, labels, lc, hr, tstart, nseg = loaded

    # Restrict to the labelled subset only.
    labelled = feature_engineering.labelled_data(
        features, labels, lc, hr, tstart, nseg)
    features_lb, labels_lb, lc_lb, hr_lb, tstart_lb, nseg_lb = labelled

    # Scaler is fit on the full set and applied to the labelled subset.
    fscaled, fscaled_lb = feature_engineering.scale_features(
        features, features_lb)

    splits = ("train", "val", "test")
    # Stack the per-split scaled features / labels into single arrays.
    fscaled_full = np.vstack([fscaled[s] for s in splits])
    labels_all = np.hstack([labels[s] for s in splits])

    # NOTE(review): tstart_lb is unpacked above but not returned — confirm
    # this omission is intentional before relying on the return tuple.
    return (features, labels, lc, hr, tstart, nseg,
            features_lb, labels_lb, lc_lb, hr_lb, nseg_lb,
            fscaled, fscaled_lb, fscaled_full, labels_all)
# ---- Example 3 ----
def load_data(datadir, tseg=1024.0, log_features=None, ranking=None):
    """Load features from *datadir*, extract the labelled subset, and scale.

    NOTE(review): this is a verbatim duplicate of the ``load_data`` defined
    earlier in this file; at import time this later definition shadows the
    earlier one.
    """
    # Full (labelled + unlabelled) feature set and its metadata.
    features, labels, lc, \
    hr, tstart, nseg = feature_engineering.load_features(datadir, tseg,
                                                   log_features=log_features,
                                                   ranking=ranking)

    # Restrict to the labelled subset only.
    features_lb, labels_lb, lc_lb, \
    hr_lb, tstart_lb, nseg_lb = feature_engineering.labelled_data(features, labels,
                                                         lc, hr, tstart, nseg)

    # Scaler is fit on the full set and applied to the labelled subset.
    fscaled, fscaled_lb = feature_engineering.scale_features(features,
                                                             features_lb)

    # Stack the per-split scaled features / labels into single arrays.
    fscaled_full = np.vstack([fscaled["train"], fscaled["val"],
                              fscaled["test"]])

    labels_all = np.hstack([labels["train"], labels["val"], labels["test"]])

    # NOTE(review): tstart_lb is computed but not returned — confirm intended.
    return features, labels, lc, hr, tstart, nseg, \
           features_lb, labels_lb, lc_lb, hr_lb, nseg_lb, \
           fscaled, fscaled_lb, fscaled_full, labels_all
# Training-data feature-engineering script: convert goal amounts to a single
# currency, encode categoricals, correct skew, scale, rebalance, then train
# and evaluate a shallow decision tree on the training set.

# Convert goal amounts into a single currency.
X_train = currency_conversion(X_train)

# Drop columns not used for modelling.
X_train = drop_columns(X_train)

print('get_dummies')
# One-hot encode the categorical columns.
cat_columns = ['country', 'category_main', 'category_sub']
X_train = make_dummies(X_train, cat_columns)

# Address skew in the numerical columns by applying a logarithm.
num_columns = ['project_duration_days', 'blurb_length', 'usd_goal']
X_train = fix_skew(X_train, skewed=num_columns)

# Scale the numerical features.
X_train = scale_features(X_train, num_columns)

# Resample to rebalance the classes.
X_train, y_train = rebalance(X_train, y_train)
print("Feature engineering on train data complete")

# Initialise a deliberately shallow tree (max_depth=3, min_samples_leaf=5)
# as a simple baseline model.
print("Training a decision tree classifier")
clf_tree = DecisionTreeClassifier(criterion="gini",
                                  max_depth=3,
                                  min_samples_leaf=5)
# Fit on the engineered training data.
clf_tree.fit(X_train, y_train)
# Predictions on the training set itself (in-sample baseline check).
y_train_pred = clf_tree.predict(X_train)
# ---- Example 5 ----
# Test-data pipeline: apply the same feature-engineering steps as the
# training script, then score a previously loaded model and print metrics.

# Convert goal amounts into a single currency.
X_test = currency_conversion(X_test)

# Drop columns not used for modelling.
X_test = drop_columns(X_test)
    
# One-hot encode the categorical columns.
# NOTE(review): the original marked this step "FUTURE WORK" even though the
# code runs — confirm whether the encoding here is considered final.
cat_columns=['country', 'category_main','category_sub']
X_test = make_dummies(X_test, cat_columns)

# Address skew in the numerical columns by applying a logarithm.
num_columns = ['project_duration_days', 'blurb_length', 'usd_goal']
X_test = fix_skew(X_test, skewed=num_columns)

# Scale the numerical features.
X_test = scale_features(X_test, num_columns)
print("Feature engineering on test data complete")

# Score the engineered test set with the previously loaded model.
y_test_pred = loaded_model.predict(X_test)

# Report confusion matrix, accuracy (as a percentage), and the full
# per-class classification report.
print("Confusion Matrix: \n", 
confusion_matrix(y_test, y_test_pred)) 
    
print ("Accuracy : \n", 
accuracy_score(y_test, y_test_pred)*100) 
    
print("Report : \n", 
classification_report(y_test, y_test_pred)) 
# ---- Example 6 ----
# File: poi_id.py  Project: cfergusonlee/p5
]

### Load the dictionary containing the dataset
# Pickle files are binary: open with "rb" (text mode "r" breaks
# deserialization under Python 3; "rb" is also correct on Python 2).
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers
# Drop known non-person / bogus records: the spreadsheet TOTAL row, a travel
# agency entry, and an all-empty employee record. .pop() without a default
# raises KeyError if a key is missing, surfacing a stale dataset early.
data_dict.pop('TOTAL')
data_dict.pop('THE TRAVEL AGENCY IN THE PARK')
data_dict.pop('LOCKHART EUGENE E')

### Task 3: Create new feature(s)
new_data_dict = create_features(data_dict)

#### Scale features for use in PCA
new_data_dict = scale_features(new_data_dict)

### Store to my_dataset for easy export below.
my_dataset = new_data_dict

### Extract features and labels from dataset for local testing
# sort_keys=True gives a deterministic row order for the local tests.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

from sklearn.pipeline import Pipeline