def multivariate_gaussian(x_train, y_train, x_test, y_test, outdir):
    """Score the train and test sets with the multivariate Gaussian model.

    The features are transformed with ``mv.transform_features`` (a Box-Cox
    transform, per the original author's note), Gaussians are fit on the
    transformed training features and labels, and "melodiness" values are
    computed for both sets.  The resulting metrics are written to
    ``<outdir>/melodiness_scores.csv`` and a short summary is printed.

    Parameters
    ----------
    x_train, y_train
        Training features and labels.
    x_test, y_test
        Test features and labels.
    outdir
        Directory that receives ``melodiness_scores.csv``.  It is not
        created here.

    Returns
    -------
    None
    """
    # Transform the features, then fit the two Gaussians on the training
    # data (presumably one per class, going by the pos/neg naming).
    train_bc, test_bc = mv.transform_features(x_train, x_test)
    gauss_pos, gauss_neg = mv.fit_gaussians(train_bc, y_train)

    # Melodiness values for the train and test sets.
    train_melodiness, test_melodiness = mv.compute_all_melodiness(
        train_bc, test_bc, gauss_pos, gauss_neg
    )

    # Metrics derived from the melodiness values.
    metrics = mv.melodiness_metrics(
        train_melodiness, test_melodiness, y_train, y_test
    )

    # NOTE(review): the threshold is selected on the *test* set; the original
    # author flagged that this should probably use validation data instead.
    # The third return value (threshold plot data) is currently not used.
    best_thresh, max_fscore, _ = eu.get_best_threshold(y_test, test_melodiness)

    metrics_frame = pd.DataFrame.from_dict(metrics)
    metrics_frame.to_csv(os.path.join(outdir, 'melodiness_scores.csv'))

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Melodiness best thresh = %s" % best_thresh)
    print("Melodiness max f1 score = %s" % max_fscore)
    print("overall melodiness scores:")
    print(metrics_frame)
def classifier(x_train, y_train, x_valid, y_valid, x_test, y_test, outdir):
    """Train, evaluate and persist the classifier.

    Steps, in order:

    1. Run ``cu.cross_val_sweep`` on the training data to pick a depth and
       write the sweep results to ``<outdir>/cv_plot_data.csv``.
    2. Train the classifier with that depth via ``cu.train_clf``.
    3. Predict on the train, validation and test sets; compute metrics from
       the train and test predictions and write them to
       ``<outdir>/classifier_scores.csv``.
    4. Pick the decision threshold from the *validation* predictions with
       ``eu.get_best_threshold``.
    5. Dump the trained classifier to ``<outdir>/classifier/rf_clf.pkl``.

    Parameters
    ----------
    x_train, y_train
        Training features and labels.
    x_valid, y_valid
        Validation features and labels (used only for threshold selection).
    x_test, y_test
        Test features and labels.
    outdir
        Existing directory that receives the CSV files and the
        ``classifier`` sub-directory.

    Returns
    -------
    tuple
        ``(clf, best_thresh)``: the trained classifier and the threshold
        returned by ``eu.get_best_threshold``.

    Raises
    ------
    OSError
        If the ``classifier`` sub-directory cannot be created and does not
        already exist as a directory.
    """
    # Cross validation
    best_depth, _, cv_plot_data = cu.cross_val_sweep(x_train, y_train)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Classifier best depth = %s" % best_depth)

    cv_frame = pd.DataFrame(np.array(cv_plot_data).transpose(),
                            columns=['max depth', 'accuracy', 'std'])
    cv_frame.to_csv(os.path.join(outdir, 'cv_plot_data.csv'))

    # Training
    clf = cu.train_clf(x_train, y_train, best_depth)

    # Predict and score
    p_train, p_valid, p_test = cu.clf_predictions(x_train, x_valid, x_test,
                                                  clf)
    clf_scores = cu.clf_metrics(p_train, p_test, y_train, y_test)
    print("Classifier scores:")
    print(clf_scores)

    # Threshold that maximizes the F1 score on the validation set.  The third
    # return value (threshold plot data) is not used here.
    best_thresh, max_fscore, _ = eu.get_best_threshold(y_valid, p_valid)

    scores_frame = pd.DataFrame.from_dict(clf_scores)
    scores_frame.to_csv(os.path.join(outdir, 'classifier_scores.csv'))

    # Create the output directory EAFP-style: checking os.path.exists() first
    # is racy and would also silently accept a regular file at this path.
    clf_outdir = os.path.join(outdir, 'classifier')
    try:
        os.mkdir(clf_outdir)
    except OSError:
        if not os.path.isdir(clf_outdir):
            raise

    joblib.dump(clf, os.path.join(clf_outdir, 'rf_clf.pkl'))

    print("Classifier best threshold = %s" % best_thresh)
    print("Classifier maximum f1 score = %s" % max_fscore)

    return clf, best_thresh