def compute_labels(train_contour_dict, valid_contour_dict,
                   test_contour_dict, olap_thresh):
    """Label the contours of all three splits and convert them for sklearn.

    The three contour dictionaries are passed to ``eu.label_all_contours``
    together with the overlap threshold; each labeled result is then
    converted with ``cc.pd_to_sklearn``.

    Parameters
    ----------
    train_contour_dict, valid_contour_dict, test_contour_dict
        Contour collections for the train / validation / test splits, in the
        form expected by ``eu.label_all_contours``.
    olap_thresh
        Overlap threshold forwarded as ``olap_thresh`` to
        ``eu.label_all_contours``.

    Returns
    -------
    x_train, y_train, x_valid, y_valid, x_test, y_test
        The pairs returned by ``cc.pd_to_sklearn`` for each labeled split.
    test_contour_dict
        The labeled test collection as returned by ``eu.label_all_contours``.
    """
    # Compute labels using the overlap threshold.
    labeled_train, labeled_valid, labeled_test = eu.label_all_contours(
        train_contour_dict, valid_contour_dict, test_contour_dict,
        olap_thresh=olap_thresh)

    train_xy = cc.pd_to_sklearn(labeled_train)
    valid_xy = cc.pd_to_sklearn(labeled_valid)
    test_xy = cc.pd_to_sklearn(labeled_test)

    x_train, y_train = train_xy
    x_valid, y_valid = valid_xy
    x_test, y_test = test_xy
    return x_train, y_train, x_valid, y_valid, x_test, y_test, labeled_test
def contour_probs(clf, contour_data, idxStartFeatures=0, idxEndFeatures=11):
    """Attach the classifier's positive-class probability to each contour.

    Parameters
    ----------
    clf : scikit-learn classifier
        Binary classifier exposing ``predict_proba``.
    contour_data : DataFrame
        DataFrame with contour information. It is modified in place: a
        ``'mel prob'`` column is added (or overwritten).
    idxStartFeatures, idxEndFeatures : int
        Feature column bounds forwarded to ``cc.pd_to_sklearn``.

    Returns
    -------
    contour_data : DataFrame
        The same DataFrame, with the second entry of each ``predict_proba``
        row stored in the ``'mel prob'`` column.
    """
    prob_column = 'mel prob'

    # Placeholder value, set before the features are extracted and replaced
    # below once the predictions are available.
    contour_data[prob_column] = -1

    feature_matrix, _ = cc.pd_to_sklearn(contour_data, idxStartFeatures,
                                         idxEndFeatures)
    class_probs = clf.predict_proba(feature_matrix)

    # Index 1 of each row is the probability of the second class.
    contour_data[prob_column] = [row[1] for row in class_probs]
    return contour_data
def _sanitize_features(features):
    """Make a feature matrix finite: -inf becomes 0, NaN/+inf via nan_to_num."""
    # Zero out -inf first: np.nan_to_num alone would replace it with the most
    # negative finite float instead of 0.
    features[features == -np.inf] = 0
    return np.nan_to_num(features)


def train_and_classify(mdb_files, train, test, dset_contour_dict,
                       dset_annot_dict):
    """Train a contour classifier on one split and decode the test melodies.

    Steps:

    - hold out the tail of the shuffled ``train`` indices as validation set;
    - cross validate the best depth of the Random Forest Classifier:
      ``cu.cross_val_sweep``;
    - train the classifier and get predictions and scikit-learn metrics:
      ``cu.train_clf``, ``cu.clf_predictions``, ``cu.clf_metrics``;
    - get the threshold with the best f-measure on the validation set:
      ``eu.get_best_threshold(Y_valid, P_valid)``;
    - classify the test contours: ``eu.contour_probs``;
    - melody decoding: ``gm.melody_from_clf``.

    Contour labeling should already be done before calling this function.

    Parameters
    ----------
    mdb_files
        Collection of track identifiers that can be indexed with a sequence
        of indices (presumably a numpy array -- TODO confirm with callers).
    train
        Mutable sequence of indices into ``mdb_files`` for training.
        NOTE: it is shuffled in place with ``random.shuffle``.
    test
        Indices into ``mdb_files`` for the test tracks.
    dset_contour_dict : dict
        Maps track identifier to its contour data.
    dset_annot_dict : dict
        Maps track identifier to its annotation.

    Returns
    -------
    mel_output_dict : dict
        Output of ``gm.melody_from_clf`` for every test track.
    test_annot_dict : dict
        Annotations of the test tracks.
    clf
        The classifier returned by ``cu.train_clf``.
    feats
        First value returned by ``getFeatureInfo``.
    """
    random.shuffle(train)
    # Floor division keeps n_train an int (it is used as a slice bound) even
    # if true division is enabled.
    n_train = len(train) - len(test) // 2
    train_tracks = mdb_files[train[:n_train]]
    valid_tracks = mdb_files[train[n_train:]]
    test_tracks = mdb_files[test]

    train_contour_dict = {k: dset_contour_dict[k] for k in train_tracks}
    valid_contour_dict = {k: dset_contour_dict[k] for k in valid_tracks}
    test_contour_dict = {k: dset_contour_dict[k] for k in test_tracks}
    test_annot_dict = {k: dset_annot_dict[k] for k in test_tracks}

    # ``reload`` is the Python 2 builtin; the reload calls are kept where the
    # original had them so edited helper modules are picked up.
    reload(eu)
    partial_olap_stats, _ = eu.olap_stats(train_contour_dict)
    print('overlapped stats on train data...')
    print(partial_olap_stats)

    reload(cc)
    # Any DataFrame will do: only the feature column layout is needed.
    anyContourDataFrame = next(iter(dset_contour_dict.values()))

    #### CONVERT PANDAS DATA to DATA for scikit Learn
    feats, idxStartFeatures, idxEndFeatures = getFeatureInfo(
        anyContourDataFrame)
    print('idxStartFeatures')
    print(idxStartFeatures)
    print('idxEndFeatures')
    print(idxEndFeatures)

    # All three splits go through the same clean-up so the classifier sees
    # identically pre-processed features at train and at prediction time.
    X_train, Y_train = cc.pd_to_sklearn(train_contour_dict, idxStartFeatures,
                                        idxEndFeatures)
    X_train = _sanitize_features(X_train)
    X_valid, Y_valid = cc.pd_to_sklearn(valid_contour_dict, idxStartFeatures,
                                        idxEndFeatures)
    X_valid = _sanitize_features(X_valid)
    X_test, Y_test = cc.pd_to_sklearn(test_contour_dict, idxStartFeatures,
                                      idxEndFeatures)
    X_test = _sanitize_features(X_test)

    ##################### cross-val of best depth of RFC
    reload(cu)
    best_depth, max_cv_accuracy, _ = cu.cross_val_sweep(X_train, Y_train,
                                                        plot=False)
    print("best depth is {}".format(best_depth))
    print("max_cv_accuracy is {}".format(max_cv_accuracy))

    ##################### 3.2 TRAIN and CLASSIFY
    clf = cu.train_clf(X_train, Y_train, best_depth)

    reload(cu)
    P_train, P_valid, P_test = cu.clf_predictions(X_train, X_valid, X_test,
                                                  clf)
    clf_scores = cu.clf_metrics(P_train, P_test, Y_train, Y_test)
    print(clf_scores['test'])

    #### get threshold with best f-measure on validation dataset
    reload(eu)
    best_thresh, max_fscore, _ = eu.get_best_threshold(Y_valid, P_valid)
    print("best threshold = %s" % best_thresh)
    print("maximum achieved f score = %s" % max_fscore)

    # classify and add the melody probability for each contour as a field in
    # the dict
    for key in list(test_contour_dict):
        test_contour_dict[key] = eu.contour_probs(clf, test_contour_dict[key],
                                                  idxStartFeatures,
                                                  idxEndFeatures)

    ################### 3.3. Melody decoding.
    ##### viterbi decoding
    reload(gm)
    mel_output_dict = {}
    for key in test_contour_dict:
        print(key)
        mel_output_dict[key] = gm.melody_from_clf(test_contour_dict[key],
                                                  prob_thresh=best_thresh)

    return mel_output_dict, test_annot_dict, clf, feats
# NOTE(review): the statements below read names (train_contour_dict,
# valid_contour_dict, test_contour_dict, dset_contour_dict) that have no
# visible module-level assignment. This looks like leftover interactive /
# notebook code -- confirm whether it is meant to run at all.

# Refresh the helper module and compute overlap statistics on the train data.
reload(eu)
olap_stats, zero_olap_stats = eu.olap_stats(train_contour_dict)

# Overlap threshold passed to eu.label_all_contours for all three splits.
OLAP_THRESH = 0.5
train_contour_dict, valid_contour_dict, test_contour_dict = \
    eu.label_all_contours(train_contour_dict, valid_contour_dict, \
                          test_contour_dict, olap_thresh=OLAP_THRESH)
# Bare expression: the result is discarded (only useful in an interactive session).
len(train_contour_dict)

reload(cc)
# Take one contour DataFrame to look up the feature column bounds.
anyContourDataFrame = dset_contour_dict[dset_contour_dict.keys()[0]]
feats, idxStartFeatures, idxEndFeatures = getFeatureInfo(
    anyContourDataFrame)

# Convert each split with cc.pd_to_sklearn using those column bounds.
X_train, Y_train = cc.pd_to_sklearn(train_contour_dict, idxStartFeatures, idxEndFeatures)
X_valid, Y_valid = cc.pd_to_sklearn(valid_contour_dict, idxStartFeatures, idxEndFeatures)
X_test, Y_test = cc.pd_to_sklearn(test_contour_dict, idxStartFeatures, idxEndFeatures)
# Bare expression: the result is discarded.
np.max(X_train, 0)

# Disabled experiments, kept as found:
# x,y = cc.pd_to_sklearn(train_contour_dict['AClassicEducation_NightOwl'])
# train_contour_dict['AClassicEducation_NightOwl']
# contour_data = train_contour_dict['AClassicEducation_NightOwl']
# x[68]
# train_contour_dict['AClassicEducation_NightOwl'].loc[68,:]
#
# X_train_boxcox, X_test_boxcox = mv.transform_features(X_train, X_test)
# rv_pos, rv_neg = mv.fit_gaussians(X_train_boxcox, Y_train)
#