Пример #1
0
def compute_labels(train_contour_dict, valid_contour_dict,
                   test_contour_dict, olap_thresh):
    """Label the contours of every split and convert them for scikit-learn.

    The three contour dictionaries are first passed together to
    ``eu.label_all_contours`` with the given overlap threshold; each labeled
    dictionary is then converted with ``cc.pd_to_sklearn``.

    Parameters
    ----------
    train_contour_dict : dict
        Contour data of the training split.
    valid_contour_dict : dict
        Contour data of the validation split.
    test_contour_dict : dict
        Contour data of the test split.
    olap_thresh : float
        Overlap threshold forwarded to ``eu.label_all_contours``.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_valid, y_valid, x_test, y_test,
        test_contour_dict)`` where the last item is the labeled test
        dictionary returned by ``eu.label_all_contours``.
    """
    # Label all three splits in a single call.
    labeled_train, labeled_valid, labeled_test = eu.label_all_contours(
        train_contour_dict, valid_contour_dict, test_contour_dict,
        olap_thresh=olap_thresh)

    # Convert split by split (train, valid, test) and flatten the
    # (features, labels) pairs into the output in that order.
    outputs = []
    for labeled_split in (labeled_train, labeled_valid, labeled_test):
        split_features, split_labels = cc.pd_to_sklearn(labeled_split)
        outputs.append(split_features)
        outputs.append(split_labels)

    outputs.append(labeled_test)
    return tuple(outputs)
def contour_probs(clf, contour_data, idxStartFeatures=0, idxEndFeatures=11):
    """ Attach the classifier's second-class probability to each contour.

    Parameters
    ----------
    clf : scikit-learn classifier
        Fitted classifier exposing ``predict_proba``.
    contour_data : DataFrame
        DataFrame with contour information. It is modified in place: a
        ``'mel prob'`` column is added (or overwritten).
    idxStartFeatures : int
        Start index of the features, forwarded to ``cc.pd_to_sklearn``.
    idxEndFeatures : int
        End index of the features, forwarded to ``cc.pd_to_sklearn``.

    Returns
    -------
    contour_data : DataFrame
        The same DataFrame, with the ``'mel prob'`` column filled in.
    """
    # The placeholder column is created *before* feature extraction, exactly
    # as in the original flow, so cc.pd_to_sklearn sees the same columns.
    contour_data['mel prob'] = -1

    feature_matrix, _unused_labels = cc.pd_to_sklearn(
        contour_data, idxStartFeatures, idxEndFeatures)
    class_probabilities = clf.predict_proba(feature_matrix)

    # Keep only the probability stored at index 1 of every row.
    contour_data['mel prob'] = [row[1] for row in class_probabilities]
    return contour_data
Пример #3
0
def _sanitize_features(features):
    """Make a feature matrix finite before it reaches the classifier.

    Entries equal to ``-inf`` are set to 0 (in place), then
    ``np.nan_to_num`` replaces any remaining NaN / +inf values.

    Parameters
    ----------
    features : array
        Feature matrix as returned by ``cc.pd_to_sklearn`` (assumed to be a
        NumPy array -- it is indexed with a boolean mask).

    Returns
    -------
    array
        The result of ``np.nan_to_num`` on the masked matrix.
    """
    # np.nan_to_num alone would map -inf to the most negative finite float;
    # zeroing it first keeps the value small. NOTE(review): presumably needed
    # because the forest classifier cannot digest such extreme values --
    # confirm against cu.train_clf.
    features[features == -np.inf] = 0
    return np.nan_to_num(features)


def train_and_classify(mdb_files, train, test, dset_contour_dict,
                       dset_annot_dict):
    """Train a contour classifier on one split and decode the test melodies.

    Steps, in order:

    - shuffle ``train`` and hold out its tail as a validation set
    - cross-validate the best classifier depth: ``cu.cross_val_sweep``
      (a Random Forest Classifier, according to the original notes)
    - train the classifier and score it: ``cu.train_clf``,
      ``cu.clf_predictions``, ``cu.clf_metrics``
    - pick the probability threshold on the validation set:
      ``eu.get_best_threshold(Y_valid, P_valid)``
    - classify the test contours: ``eu.contour_probs``
    - melody decoding: ``gm.melody_from_clf``

    Contour labeling is not performed here; it should already be done.

    Parameters
    ----------
    mdb_files : array
        Track identifiers; indexed with the index collections ``train`` and
        ``test`` (presumably a NumPy array -- TODO confirm).
    train : sequence of int
        Indices into ``mdb_files`` for training + validation.
        NOTE: shuffled in place, so the caller's object is modified.
    test : sequence of int
        Indices into ``mdb_files`` for testing. Half of its length (rounded
        down) is the number of ``train`` entries held out for validation.
    dset_contour_dict : dict
        Contour data for every track, keyed by track identifier.
    dset_annot_dict : dict
        Annotations for every track, keyed by track identifier.

    Returns
    -------
    mel_output_dict : dict
        Output of ``gm.melody_from_clf`` for every test track.
    test_annot_dict : dict
        Subset of ``dset_annot_dict`` restricted to the test tracks.
    clf : classifier
        Classifier returned by ``cu.train_clf``.
    feats :
        First value returned by ``getFeatureInfo``.
    """
    random.shuffle(train)
    # Floor division keeps the slice index an int regardless of the
    # division semantics in effect.
    n_train = len(train) - len(test) // 2
    train_tracks = mdb_files[train[:n_train]]
    valid_tracks = mdb_files[train[n_train:]]
    test_tracks = mdb_files[test]

    train_contour_dict = {k: dset_contour_dict[k] for k in train_tracks}
    valid_contour_dict = {k: dset_contour_dict[k] for k in valid_tracks}
    test_contour_dict = {k: dset_contour_dict[k] for k in test_tracks}

    test_annot_dict = {k: dset_annot_dict[k] for k in test_tracks}

    # NOTE: the reload() calls are kept where the original had them so that
    # module side effects are unchanged; they look like notebook leftovers.
    reload(eu)
    partial_olap_stats, _ = eu.olap_stats(train_contour_dict)
    print('overlapped stats on train data...')
    print(partial_olap_stats)

    reload(cc)

    # Any contour DataFrame will do: only its feature layout is inspected.
    any_contour_frame = dset_contour_dict[next(iter(dset_contour_dict))]

    #### CONVERT PANDAS DATA to DATA for scikit Learn
    feats, idxStartFeatures, idxEndFeatures = getFeatureInfo(
        any_contour_frame)
    print('idxStartFeatures')
    print(idxStartFeatures)
    print('idxEndFeatures')
    print(idxEndFeatures)

    # All three splits go through the same clean-up so the classifier is
    # evaluated on features prepared exactly like its training data.
    X_train, Y_train = cc.pd_to_sklearn(train_contour_dict, idxStartFeatures,
                                        idxEndFeatures)
    X_train = _sanitize_features(X_train)
    X_valid, Y_valid = cc.pd_to_sklearn(valid_contour_dict, idxStartFeatures,
                                        idxEndFeatures)
    X_valid = _sanitize_features(X_valid)
    X_test, Y_test = cc.pd_to_sklearn(test_contour_dict, idxStartFeatures,
                                      idxEndFeatures)
    X_test = _sanitize_features(X_test)

    #####################  cross-val of best depth of RFC
    reload(cu)
    best_depth, max_cv_accuracy, _ = cu.cross_val_sweep(X_train, Y_train,
                                                        plot=False)
    print("best depth is {}".format(best_depth))
    print("max_cv_accuracy is {}".format(max_cv_accuracy))

    ##################### 3.2 TRAIN and CLASSIFY
    clf = cu.train_clf(X_train, Y_train, best_depth)

    reload(cu)
    P_train, P_valid, P_test = cu.clf_predictions(X_train, X_valid, X_test,
                                                  clf)
    clf_scores = cu.clf_metrics(P_train, P_test, Y_train, Y_test)
    print(clf_scores['test'])

    #### get threshold with best f-measure on validation dataset
    reload(eu)
    best_thresh, max_fscore, _ = eu.get_best_threshold(Y_valid, P_valid)
    # Report the score actually achieved (it used to be reset to 0.0 here).
    print("best threshold = %s" % best_thresh)
    print("maximum achieved f score = %s" % max_fscore)

    # classify and add the melody probability for each contour as a field
    # in the dict
    for key in list(test_contour_dict):
        test_contour_dict[key] = eu.contour_probs(clf, test_contour_dict[key],
                                                  idxStartFeatures,
                                                  idxEndFeatures)

    ################### 3.3. Melody decoding.
    #####  viterbi decoding
    reload(gm)
    mel_output_dict = {}
    for key in test_contour_dict:
        print(key)
        mel_output_dict[key] = gm.melody_from_clf(test_contour_dict[key],
                                                  prob_thresh=best_thresh)

    return mel_output_dict, test_annot_dict, clf, feats
        reload(eu)
        olap_stats, zero_olap_stats = eu.olap_stats(train_contour_dict)
        OLAP_THRESH = 0.5
        train_contour_dict, valid_contour_dict, test_contour_dict = \
            eu.label_all_contours(train_contour_dict, valid_contour_dict, \
                                  test_contour_dict, olap_thresh=OLAP_THRESH)
        len(train_contour_dict)

        reload(cc)

        anyContourDataFrame = dset_contour_dict[dset_contour_dict.keys()[0]]

        feats, idxStartFeatures, idxEndFeatures = getFeatureInfo(
            anyContourDataFrame)

        X_train, Y_train = cc.pd_to_sklearn(train_contour_dict,
                                            idxStartFeatures, idxEndFeatures)
        X_valid, Y_valid = cc.pd_to_sklearn(valid_contour_dict,
                                            idxStartFeatures, idxEndFeatures)
        X_test, Y_test = cc.pd_to_sklearn(test_contour_dict, idxStartFeatures,
                                          idxEndFeatures)
        np.max(X_train, 0)

        # x,y = cc.pd_to_sklearn(train_contour_dict['AClassicEducation_NightOwl'])
        # train_contour_dict['AClassicEducation_NightOwl']
        # contour_data = train_contour_dict['AClassicEducation_NightOwl']
        # x[68]
        # train_contour_dict['AClassicEducation_NightOwl'].loc[68,:]
        #
        # X_train_boxcox, X_test_boxcox = mv.transform_features(X_train, X_test)
        # rv_pos, rv_neg = mv.fit_gaussians(X_train_boxcox, Y_train)
        #