def multivariate_gaussian(x_train, y_train, x_test, y_test, outdir):
    """Score the data with the multivariate Gaussian melodiness model.

    The features are Box-Cox transformed with ``mv.transform_features``,
    the pair of distributions returned by ``mv.fit_gaussians`` is fitted on
    the transformed training data, and melodiness scores are computed for
    both the train and the test set. The metrics derived from those scores
    are written to ``<outdir>/melodiness_scores.csv`` and printed together
    with the best threshold and its F1 score.

    Parameters
    ----------
    x_train, y_train : training features and labels handed to the ``mv``
        helpers.
    x_test, y_test : test features and labels handed to the ``mv`` helpers.
    outdir : directory the CSV file is written into.

    Returns
    -------
    None
    """
    # Box-Cox transform, then fit the Gaussians on the training portion.
    boxcox_train, boxcox_test = mv.transform_features(x_train, x_test)
    pos_model, neg_model = mv.fit_gaussians(boxcox_train, y_train)

    # Melodiness of every train and test sample under the fitted models.
    train_melodiness, test_melodiness = mv.compute_all_melodiness(
        boxcox_train, boxcox_test, pos_model, neg_model)

    # Metrics derived from the melodiness scores.
    metrics = mv.melodiness_metrics(
        train_melodiness, test_melodiness, y_train, y_test)

    # NOTE(review): the threshold is selected on the *test* set here; it
    # should probably be selected on validation data instead. The sweep
    # data returned as the third value is not persisted.
    best_thresh, max_fscore, _ = eu.get_best_threshold(
        y_test, test_melodiness)

    scores_frame = pd.DataFrame.from_dict(metrics)
    scores_frame.to_csv(os.path.join(outdir, 'melodiness_scores.csv'))

    print("Melodiness best thresh = %s" % best_thresh)
    print("Melodiness max f1 score = %s" % max_fscore)
    print("overall melodiness scores:")
    print(scores_frame)
def classifier(x_train, y_train, x_valid, y_valid, x_test, y_test, outdir):
    """Train the classifier, score it, and save the results under `outdir`.

    Steps, in order:

    1. Sweep the maximum depth with ``cu.cross_val_sweep`` on the training
       data and write the sweep table to ``<outdir>/cv_plot_data.csv``.
    2. Train a classifier at the best depth with ``cu.train_clf``.
    3. Predict on train/validation/test with ``cu.clf_predictions`` and
       compute train/test metrics with ``cu.clf_metrics``; the metrics go
       to ``<outdir>/classifier_scores.csv``.
    4. Pick the threshold that maximizes the F1 score on the validation
       predictions with ``eu.get_best_threshold``.
    5. Dump the classifier to ``<outdir>/classifier/rf_clf.pkl``, creating
       the ``classifier`` directory when it does not exist yet.

    Returns
    -------
    clf : the classifier returned by ``cu.train_clf``.
    best_thresh : the threshold returned by ``eu.get_best_threshold`` for
        the validation set.
    """
    # --- cross validation over the tree depth -----------------------------
    best_depth, _, sweep_data = cu.cross_val_sweep(x_train, y_train)
    print("Classifier best depth = %s" % best_depth)

    sweep_frame = pd.DataFrame(
        np.array(sweep_data).transpose(),
        columns=['max depth', 'accuracy', 'std'])
    sweep_frame.to_csv(os.path.join(outdir, 'cv_plot_data.csv'))

    # --- training ---------------------------------------------------------
    clf = cu.train_clf(x_train, y_train, best_depth)

    # --- prediction and scoring -------------------------------------------
    p_train, p_valid, p_test = cu.clf_predictions(
        x_train, x_valid, x_test, clf)
    scores = cu.clf_metrics(p_train, p_test, y_train, y_test)
    print("Classifier scores:")
    print(scores)

    # Threshold that maximizes the F1 score on the validation set. The
    # sweep data returned as the third value is not persisted.
    best_thresh, max_fscore, _ = eu.get_best_threshold(y_valid, p_valid)

    scores_frame = pd.DataFrame.from_dict(scores)
    scores_frame.to_csv(os.path.join(outdir, 'classifier_scores.csv'))

    # --- persist the trained model ----------------------------------------
    clf_outdir = os.path.join(outdir, 'classifier')
    if not os.path.exists(clf_outdir):
        os.mkdir(clf_outdir)
    joblib.dump(clf, os.path.join(clf_outdir, 'rf_clf.pkl'))

    print("Classifier best threshold = %s" % best_thresh)
    print("Classifier maximum f1 score = %s" % max_fscore)

    return clf, best_thresh
def _replace_non_finite(features):
    """Set ``-inf`` entries to 0, then return ``np.nan_to_num(features)``."""
    features = np.asarray(features)
    # Without this step np.nan_to_num would turn -inf into the most negative
    # finite float instead of 0.
    features[features == -np.inf] = 0
    return np.nan_to_num(features)


def train_and_classify(mdb_files, train, test, dset_contour_dict,
                       dset_annot_dict):
    """Train a contour classifier and decode a melody for every test track.

    Contour labeling is expected to have been done beforehand. The steps:

    - split the training indices into train and validation parts (the
      validation part gets ``len(test) // 2`` tracks),
    - cross validate the best depth of the Random Forest classifier
      (``cu.cross_val_sweep``),
    - train the classifier and compute its metrics (``cu.train_clf``,
      ``cu.clf_predictions``, ``cu.clf_metrics``),
    - pick the threshold with the best F-measure on the validation set
      (``eu.get_best_threshold``),
    - attach the classifier output to the test contours
      (``eu.contour_probs``),
    - decode the melody of every test track (``gm.melody_from_clf``).

    Parameters
    ----------
    mdb_files : container of track keys that can be indexed with a sequence
        of indices (presumably a numpy array -- confirm against the caller).
    train, test : sequences of indices into `mdb_files`. `train` is copied
        before shuffling, so the caller's sequence is not modified.
    dset_contour_dict : dict mapping track key to that track's contour data.
    dset_annot_dict : dict mapping track key to that track's annotation.

    Returns
    -------
    mel_output_dict : dict mapping each test track key to the output of
        ``gm.melody_from_clf``.
    test_annot_dict : `dset_annot_dict` restricted to the test tracks.
    clf : the classifier returned by ``cu.train_clf``.
    feats : first value returned by ``getFeatureInfo``.
    """
    # Shuffle a copy so the caller's index sequence is left untouched.
    train = list(train)
    random.shuffle(train)

    # Floor division keeps the slice bound an integer.
    n_train = len(train) - (len(test) // 2)
    train_tracks = mdb_files[train[:n_train]]
    valid_tracks = mdb_files[train[n_train:]]
    test_tracks = mdb_files[test]

    train_contour_dict = {k: dset_contour_dict[k] for k in train_tracks}
    valid_contour_dict = {k: dset_contour_dict[k] for k in valid_tracks}
    test_contour_dict = {k: dset_contour_dict[k] for k in test_tracks}
    test_annot_dict = {k: dset_annot_dict[k] for k in test_tracks}

    partial_olap_stats, _ = eu.olap_stats(train_contour_dict)
    print('overlapped stats on train data...')
    print(partial_olap_stats)

    # Any entry will do for looking up the feature column range.
    any_contour_frame = dset_contour_dict[list(dset_contour_dict)[0]]
    feats, idx_start, idx_end = getFeatureInfo(any_contour_frame)
    print('idxStartFeatures')
    print(idx_start)
    print('idxEndFeatures')
    print(idx_end)

    # Convert the pandas data into arrays for scikit-learn.
    x_train, y_train = cc.pd_to_sklearn(train_contour_dict,
                                        idx_start, idx_end)
    x_valid, y_valid = cc.pd_to_sklearn(valid_contour_dict,
                                        idx_start, idx_end)
    x_test, y_test = cc.pd_to_sklearn(test_contour_dict,
                                      idx_start, idx_end)

    # Clean every split the same way so the classifier sees the same
    # feature encoding at training and at prediction time.
    x_train = _replace_non_finite(x_train)
    x_valid = _replace_non_finite(x_valid)
    x_test = _replace_non_finite(x_test)

    # Cross validation of the best classifier depth.
    best_depth, max_cv_accuracy, _ = cu.cross_val_sweep(x_train, y_train,
                                                        plot=False)
    print("best depth is {}".format(best_depth))
    print("max_cv_accuracy is {}".format(max_cv_accuracy))

    # Train and classify.
    clf = cu.train_clf(x_train, y_train, best_depth)
    p_train, p_valid, p_test = cu.clf_predictions(x_train, x_valid,
                                                  x_test, clf)
    clf_scores = cu.clf_metrics(p_train, p_test, y_train, y_test)
    print(clf_scores['test'])

    # Threshold with the best F-measure on the validation set; report the
    # score that threshold actually achieved.
    best_thresh, max_fscore, _ = eu.get_best_threshold(y_valid, p_valid)
    print("best threshold = %s" % best_thresh)
    print("maximum achieved f score = %s" % max_fscore)

    # Replace each test entry with the result of eu.contour_probs.
    # Iterating over a key snapshot keeps the reassignment safe.
    for key in list(test_contour_dict):
        test_contour_dict[key] = eu.contour_probs(
            clf, test_contour_dict[key], idx_start, idx_end)

    # Melody decoding (Viterbi) for every test track.
    mel_output_dict = {}
    for key, contours in test_contour_dict.items():
        print(key)
        mel_output_dict[key] = gm.melody_from_clf(contours,
                                                  prob_thresh=best_thresh)

    return mel_output_dict, test_annot_dict, clf, feats
# NOTE(review): this fragment repeats the tail of train_and_classify and
# relies on names (best_depth, max_cv_accuracy, plot_dat, X_train, X_valid,
# X_test, Y_train, Y_valid, Y_test, test_contour_dict, idxStartFeatures,
# idxEndFeatures) that must already exist in the enclosing scope -- confirm
# whether it is still needed.
print(best_depth)
print(max_cv_accuracy)

# Cross-validation sweep as a table: one row per depth that was tried.
df = pd.DataFrame(np.array(plot_dat).transpose(),
                  columns=['max depth', 'accuracy', 'std'])

# Train at the best depth, then predict on all three splits.
clf = cu.train_clf(X_train, Y_train, best_depth)
reload(cu)
P_train, P_valid, P_test = cu.clf_predictions(X_train, X_valid, X_test, clf)
clf_scores = cu.clf_metrics(P_train, P_test, Y_train, Y_test)
print(clf_scores['test'])

# Threshold with the best F-measure on the validation set.
reload(eu)
best_thresh, max_fscore, plot_data = eu.get_best_threshold(Y_valid, P_valid)
print("best threshold = %s" % best_thresh)
print("maximum achieved f score = %s" % max_fscore)

# Replace each test entry with the result of eu.contour_probs.
for key in test_contour_dict.keys():
    test_contour_dict[key] = eu.contour_probs(clf, test_contour_dict[key],
                                              idxStartFeatures,
                                              idxEndFeatures)

# Decode a melody for every test track.
reload(gm)
mel_output_dict = {}
for i, key in enumerate(test_contour_dict.keys()):
    print(key)
    mel_output_dict[key] = gm.melody_from_clf(test_contour_dict[key],
                                              prob_thresh=best_thresh)