Example #1
def train_svm(paths,
              mid_window,
              mid_step,
              short_window,
              short_step,
              model_name,
              compute_beat=False,
              train_percentage=0.90):

    # STEP A: Feature Extraction:
    [features, class_names,
     _] = aF.multiple_directory_feature_extraction(paths,
                                                   mid_window,
                                                   mid_step,
                                                   short_window,
                                                   short_step,
                                                   compute_beat=compute_beat)

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    write_modearff_file(model_name, features, class_names, feature_names)
    # STEP B: classifier Evaluation and Parameter Selection:
    # get optimal classifier parameter (feature vectors containing
    # NaN/Inf values are discarded first):
    temp_features = []
    for feat in features:
        temp = []
        for i in range(feat.shape[0]):
            temp_fv = feat[i, :]
            if (not np.isnan(temp_fv).any()) and (not np.isinf(temp_fv).any()):
                temp.append(temp_fv.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        temp_features.append(np.array(temp))
    features = temp_features

    best_param = aT.evaluate_classifier(features, class_names, 100, "svm",
                                        step_params.container.get_svm(), 0,
                                        train_percentage)

    print("Selected params: {0:.5f}".format(best_param))

    features_norm, mean, std = normalize_features(features)
    mean = mean.tolist()
    std = std.tolist()

    # STEP C: Save the classifier to file
    classifier = aT.train_svm(features_norm, best_param)
    write_mode_file(model_name, classifier, mean, std, class_names, mid_window,
                    mid_step, short_window, short_step, compute_beat)
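A minimal usage sketch for the wrapper above. The class folders and the model name are hypothetical placeholders; each folder is assumed to hold the WAV files of a single class:

# Hypothetical call: 1.0 s mid-term and 0.05 s short-term windows,
# with step equal to window length (no overlap).
train_svm(["audio/speech", "audio/music"],
          1.0, 1.0, 0.05, 0.05,
          model_name="svm_speech_music")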
Example #2
def extract_features(music_parent_directory):

    # List all subdirectories, skipping the macOS '.DS_Store' entry
    subdirectories = [
        d for d in os.listdir(music_parent_directory) if d != '.DS_Store'
    ]

    subdirectories = [
        os.path.join(music_parent_directory, subdirectory)
        for subdirectory in subdirectories
    ]

    return aF.multiple_directory_feature_extraction(subdirectories, 1.0, 1.0,
                                                    aT.shortTermWindow,
                                                    aT.shortTermStep, True)
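Assuming a layout such as music/classical, music/jazz, ... (the directory names are placeholders), the wrapper only needs the parent directory; the returned tuple mirrors aF.multiple_directory_feature_extraction:

# features: one matrix per genre subdirectory;
# class_names: the subdirectory names; file_names: per-class WAV lists.
features, class_names, file_names = extract_features("music")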
Example #3
def feature_extraction_train_regression(folder_name,
                                        mid_window,
                                        mid_step,
                                        short_window,
                                        short_step,
                                        model_type,
                                        model_name,
                                        compute_beat=False):
    """
    This function is used as a wrapper to segment-based audio
    feature extraction and classifier training.
    ARGUMENTS:
        folder_name:        path of directory containing the WAV files
                         and Regression CSVs
        mid_window, mid_step:   mid-term window length and step
        short_window, short_step:   short-term window and step
        model_type:        "svm" or "knn" or "randomforest"
        model_name:        name of the model to be saved
    RETURNS:
        None. Resulting regression model along with the respective
        model parameters are saved on files.
    """
    # STEP A: Feature Extraction:
    features, _, filenames = \
        aF.multiple_directory_feature_extraction([folder_name], mid_window,
                                                 mid_step, short_window,
                                                 short_step,
                                                 compute_beat=compute_beat)
    if len(features) == 0:
        print("ERROR: No data found in any input folder!")
        return
    features = features[0]
    filenames = [ntpath.basename(f) for f in filenames[0]]

    # Read CSVs:
    csv_files = glob.glob(folder_name + os.sep + "*.csv")
    regression_labels = []
    regression_names = []
    f_final = []
    for c in csv_files:
        cur_regression_labels = []
        f_temp = []
        # open the csv file that contains the current target value's annotations
        with open(c, 'rt') as csvfile:
            csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for row in csv_reader:
                if len(row) == 2:
                    # ... and if the current filename exists
                    # in the list of filenames
                    if row[0] in filenames:
                        index = filenames.index(row[0])
                        cur_regression_labels.append(float(row[1]))
                        f_temp.append(features[index, :])
                    else:
                        print("Warning: {} not found "
                              "in list of files.".format(row[0]))
                else:
                    print(
                        "Warning: Row with unknown format in regression file")

        f_final.append(np.array(f_temp))
        # cur_regression_labels is the list of values
        # for the current regression problem
        regression_labels.append(np.array(cur_regression_labels))
        # regression task name
        regression_names.append(ntpath.basename(c).replace(".csv", ""))

    # TODO: ARFF write?
    # STEP B: classifier Evaluation and Parameter Selection:
    if model_type == "svm" or model_type == "svm_rbf":
        model_params = np.array(
            [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 5.0, 10.0])
    elif model_type == "randomforest":
        model_params = np.array([5, 10, 25, 50, 100])

    errors = []
    errors_base = []
    best_params = []

    for iRegression, r in enumerate(regression_names):
        # get optimal classifier parameter:
        print("Regression task " + r)
        bestParam, error, berror = evaluate_regression(
            f_final[iRegression], regression_labels[iRegression], 100,
            model_type, model_params)
        errors.append(error)
        errors_base.append(berror)
        best_params.append(bestParam)
        print("Selected params: {0:.5f}".format(bestParam))

        features_norm, mean, std = normalize_features([f_final[iRegression]])

        # STEP C: Save the model to file
        if model_type == "svm":
            classifier, _ = train_svm_regression(
                features_norm[0], regression_labels[iRegression], bestParam)
        if model_type == "svm_rbf":
            classifier, _ = train_svm_regression(
                features_norm[0],
                regression_labels[iRegression],
                bestParam,
                kernel='rbf')
        if model_type == "randomforest":
            classifier, _ = train_random_forest_regression(
                features_norm[0], regression_labels[iRegression], bestParam)

        if model_type == "svm" or model_type == "svm_rbf" \
                or model_type == "randomforest":
            with open(model_name + "_" + r, 'wb') as fid:
                cPickle.dump(classifier, fid)
            save_path = model_name + "_" + r + "MEANS"
            save_parameters(save_path, mean, std, mid_window, mid_step,
                            short_window, short_step, compute_beat)

    return errors, errors_base, best_params
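The annotation format follows from the parsing loop above: one CSV per regression task, each row holding a WAV file name and its target value. A hypothetical call (folder layout and model name are placeholders):

# "songs" holds the WAV files plus e.g. songs/valence.csv with rows like
#     happy_song.wav,0.87
#     sad_song.wav,-0.63
errors, errors_base, best_params = feature_extraction_train_regression(
    "songs", 1.0, 1.0, 0.05, 0.05, "svm", "svm_valence")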
Example #4
def extract_features_and_train(paths,
                               mid_window,
                               mid_step,
                               short_window,
                               short_step,
                               classifier_type,
                               model_name,
                               compute_beat=False,
                               train_percentage=0.90):
    """
    This function is used as a wrapper to segment-based audio feature extraction
    and classifier training.
    ARGUMENTS:
        paths:                      list of paths of directories. Each directory
                                    contains a single audio class whose samples
                                    are stored in separate WAV files.
        mid_window, mid_step:       mid-term window length and step
        short_window, short_step:   short-term window and step
        classifier_type:            "svm" or "knn" or "randomforest" or
                                    "gradientboosting" or "extratrees"
        model_name:                 name of the model to be saved
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files.
    """

    # STEP A: Feature Extraction:
    features, class_names, _ = \
        aF.multiple_directory_feature_extraction(paths, mid_window, mid_step,
                                                 short_window, short_step,
                                                 compute_beat=compute_beat)

    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    write_train_data_arff(model_name, features, class_names, feature_names)

    for i, feat in enumerate(features):
        if len(feat) == 0:
            print("trainSVM_feature ERROR: " + paths[i] +
                  " folder is empty or non-existing!")
            return

    # STEP B: classifier Evaluation and Parameter Selection:
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "knn":
        classifier_par = np.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "extratrees":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])

    # get optimal classifier parameter:
    temp_features = []
    for feat in features:
        temp = []
        for i in range(feat.shape[0]):
            temp_fv = feat[i, :]
            if (not np.isnan(temp_fv).any()) and (not np.isinf(temp_fv).any()):
                temp.append(temp_fv.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        temp_features.append(np.array(temp))
    features = temp_features

    best_param = evaluate_classifier(features, class_names, 100,
                                     classifier_type, classifier_par, 0,
                                     train_percentage)

    print("Selected params: {0:.5f}".format(best_param))

    features_norm, mean, std = normalize_features(features)
    mean = mean.tolist()
    std = std.tolist()

    # STEP C: Save the classifier to file
    if classifier_type == "svm":
        classifier = train_svm(features_norm, best_param)
    elif classifier_type == "svm_rbf":
        classifier = train_svm(features_norm, best_param, kernel='rbf')
    elif classifier_type == "randomforest":
        classifier = train_random_forest(features_norm, best_param)
    elif classifier_type == "gradientboosting":
        classifier = train_gradient_boosting(features_norm, best_param)
    elif classifier_type == "extratrees":
        classifier = train_extra_trees(features_norm, best_param)

    if classifier_type == "knn":
        feature_matrix, labels = features_to_matrix(features_norm)
        feature_matrix = feature_matrix.tolist()
        labels = labels.tolist()
        save_path = model_name
        save_parameters(save_path, feature_matrix, labels, mean, std,
                        class_names, best_param, mid_window, mid_step,
                        short_window, short_step, compute_beat)

    elif classifier_type == "svm" or classifier_type == "svm_rbf" or \
            classifier_type == "randomforest" or \
            classifier_type == "gradientboosting" or \
            classifier_type == "extratrees":
        with open(model_name, 'wb') as fid:
            cPickle.dump(classifier, fid)
        save_path = model_name + "MEANS"
        save_parameters(save_path, mean, std, class_names, mid_window,
                        mid_step, short_window, short_step, compute_beat)
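A minimal usage sketch (paths and model name are hypothetical). The trained model is pickled to model_name, and the normalization and feature-extraction parameters to model_name + "MEANS":

# Hypothetical call: train an RBF-kernel SVM on three class folders,
# using 80% of the data for training in each evaluation run.
extract_features_and_train(["audio/silence", "audio/speech", "audio/music"],
                           1.0, 1.0, 0.05, 0.05,
                           "svm_rbf", "svm_rbf_3class",
                           train_percentage=0.80)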
Example #5
def featureAndTrain(list_of_dirs,
                    mt_win,
                    mt_step,
                    st_win,
                    st_step,
                    classifier_type,
                    model_name,
                    compute_beat=False,
                    perTrain=0.90):
    """
    This function is used as a wrapper to segment-based audio feature extraction
    and classifier training.
    ARGUMENTS:
        list_of_dirs:        list of paths of directories. Each directory
                             contains a single audio class whose samples
                             are stored in separate WAV files.
        mt_win, mt_step:        mid-term window length and step
        st_win, st_step:        short-term window and step
        classifier_type:        "svm" or "knn" or "randomforest" or
                             "gradientboosting" or "extratrees"
        model_name:          name of the model to be saved
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files.
    """

    # STEP A: Feature Extraction:
    [features, classNames,
     _] = aF.multiple_directory_feature_extraction(list_of_dirs,
                                                   mt_win,
                                                   mt_step,
                                                   st_win,
                                                   st_step,
                                                   compute_beat=compute_beat)

    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    writeTrainDataToARFF(model_name, features, classNames, feature_names)

    for i, f in enumerate(features):
        if len(f) == 0:
            print("trainSVM_feature ERROR: " + list_of_dirs[i] +
                  " folder is empty or non-existing!")
            return

    # STEP B: classifier Evaluation and Parameter Selection:
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "knn":
        classifier_par = np.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "extratrees":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])

    # get optimal classifier parameter:
    features2 = []
    for f in features:
        fTemp = []
        for i in range(f.shape[0]):
            temp = f[i, :]
            if (not np.isnan(temp).any()) and (not np.isinf(temp).any()):
                fTemp.append(temp.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        features2.append(np.array(fTemp))
    features = features2

    bestParam = evaluateclassifier(features, classNames, 100, classifier_type,
                                   classifier_par, 0, perTrain)

    print("Selected params: {0:.5f}".format(bestParam))

    [features_norm, MEAN, STD] = normalizeFeatures(features)
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    featuresNew = features_norm

    # STEP C: Save the classifier to file
    if classifier_type == "svm":
        classifier = trainSVM(featuresNew, bestParam)
    elif classifier_type == "svm_rbf":
        classifier = trainSVM_RBF(featuresNew, bestParam)
    elif classifier_type == "randomforest":
        classifier = trainRandomForest(featuresNew, bestParam)
    elif classifier_type == "gradientboosting":
        classifier = trainGradientBoosting(featuresNew, bestParam)
    elif classifier_type == "extratrees":
        classifier = trainExtraTrees(featuresNew, bestParam)

    if classifier_type == "knn":
        [X, Y] = listOfFeatures2Matrix(featuresNew)
        X = X.tolist()
        Y = Y.tolist()
        fo = open(model_name, "wb")
        cPickle.dump(X, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(Y, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(bestParam, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
    elif classifier_type == "svm" or classifier_type == "svm_rbf" or \
                    classifier_type == "randomforest" or \
                    classifier_type == "gradientboosting" or \
                    classifier_type == "extratrees":
        with open(model_name, 'wb') as fid:
            cPickle.dump(classifier, fid)
        fo = open(model_name + "MEANS", "wb")
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
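Because the kNN branch above pickles its fields one at a time, a loader has to read them back in exactly the same order. A minimal sketch (the helper name is hypothetical; on Python 3, cPickle is the standard pickle module):

def load_knn_model(model_name):
    # Read the fields in the exact order featureAndTrain dumped them.
    with open(model_name, "rb") as fo:
        X = cPickle.load(fo)
        Y = cPickle.load(fo)
        MEAN = cPickle.load(fo)
        STD = cPickle.load(fo)
        classNames = cPickle.load(fo)
        bestParam = cPickle.load(fo)
        mt_win = cPickle.load(fo)
        mt_step = cPickle.load(fo)
        st_win = cPickle.load(fo)
        st_step = cPickle.load(fo)
        compute_beat = cPickle.load(fo)
    return (X, Y, MEAN, STD, classNames, bestParam,
            mt_win, mt_step, st_win, st_step, compute_beat)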
Example #6
def extract_features(paths,
                     mid_window,
                     mid_step,
                     short_window,
                     short_step,
                     compute_beat=False,
                     train_percentage=0.90):
    """
    This function is used as a wrapper to segment-based audio
    feature extraction.
    ARGUMENTS:
        paths:                      list of paths of directories. Each directory
                                    contains a single audio class whose samples
                                    are stored in separate WAV files.
        mid_window, mid_step:       mid-term window length and step
        short_window, short_step:   short-term window and step
    RETURNS:
        The list of normalized feature matrices, one per class.
    """

    # STEP A: Feature Extraction:
    features, class_names, _ = \
        aF.multiple_directory_feature_extraction(paths, mid_window, mid_step,
                                                 short_window, short_step,
                                                 compute_beat=compute_beat)

    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    # write_train_data_arff(model_name, features, class_names, feature_names)

    for i, feat in enumerate(features):
        if len(feat) == 0:
            print("trainSVM_feature ERROR: " + paths[i] +
                  " folder is empty or non-existing!")
            return

    features2 = []
    for f in features:
        fTemp = []
        for i in range(f.shape[0]):
            temp = f[i, :]
            if (not np.isnan(temp).any()) and (not np.isinf(temp).any()):
                fTemp.append(temp.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        features2.append(np.array(fTemp))
    features = features2
    # normalize features (mean/std vectors are computed but not needed here)
    featuresNorm, _, _ = normalize_features(features)
    return featuresNorm
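A short usage sketch (directory paths are placeholders); each returned matrix holds the zero-mean, unit-variance mid-term feature vectors of one class:

normalized = extract_features(["audio/speech", "audio/music"],
                              1.0, 1.0, 0.05, 0.05)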
Example #7
def extract_features_and_train(paths,
                               mid_window,
                               mid_step,
                               short_window,
                               short_step,
                               classifier_type,
                               model_name,
                               compute_beat=False,
                               train_percentage=0.90,
                               dict_of_ids=None,
                               use_smote=False):
    """
    This function is used as a wrapper to segment-based audio feature extraction
    and classifier training.
    ARGUMENTS:
        paths:                      list of paths of directories. Each directory
                                    contains a single audio class whose samples
                                    are stored in separate WAV files.
        mid_window, mid_step:       mid-term window length and step
        short_window, short_step:   short-term window and step
        classifier_type:            "svm" or "knn" or "randomforest" or
                                    "gradientboosting" or "extratrees"
        model_name:                 name of the model to be saved
        dict_of_ids:                dictionary mapping the full path of each
                                    audio file to its group id
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files.
    """

    # STEP A: Feature Extraction:
    features, class_names, file_names = \
        aF.multiple_directory_feature_extraction(paths, mid_window, mid_step,
                                                 short_window, short_step,
                                                 compute_beat=compute_beat)
    file_names = [item for sublist in file_names for item in sublist]
    if dict_of_ids:
        list_of_ids = [dict_of_ids[file] for file in file_names]
    else:
        list_of_ids = None
    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    for i, feat in enumerate(features):
        if len(feat) == 0:
            print("trainSVM_feature ERROR: " + paths[i] +
                  " folder is empty or non-existing!")
            return

    # STEP B: classifier Evaluation and Parameter Selection:
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "knn":
        classifier_par = np.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "extratrees":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])

    # get optimal classifier parameter:
    temp_features = []
    for feat in features:
        temp = []
        for i in range(feat.shape[0]):
            temp_fv = feat[i, :]
            if (not np.isnan(temp_fv).any()) and (not np.isinf(temp_fv).any()):
                temp.append(temp_fv.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        temp_features.append(np.array(temp))
    features = temp_features

    best_param = evaluate_classifier(features,
                                     class_names,
                                     classifier_type,
                                     classifier_par,
                                     1,
                                     list_of_ids,
                                     n_exp=-1,
                                     train_percentage=train_percentage,
                                     smote=use_smote)

    print("Selected params: {0:.5f}".format(best_param))

    # STEP C: Train and Save the classifier to file
    # Get features in the X, y format:
    features, labels = features_to_matrix(features)
    # Apply smote if necessary:
    if use_smote:
        sm = SMOTE(random_state=2)
        features, labels = sm.fit_resample(features, labels)

    # Use mean/std standard feature scaling:
    scaler = StandardScaler()
    features = scaler.fit_transform(features)
    mean = scaler.mean_.tolist()
    std = scaler.scale_.tolist()

    # Then train the final classifier
    if classifier_type == "svm":
        classifier = train_svm(features, labels, best_param)
    elif classifier_type == "svm_rbf":
        classifier = train_svm(features, labels, best_param, kernel='rbf')
    elif classifier_type == "randomforest":
        classifier = train_random_forest(features, labels, best_param)
    elif classifier_type == "gradientboosting":
        classifier = train_gradient_boosting(features, labels, best_param)
    elif classifier_type == "extratrees":
        classifier = train_extra_trees(features, labels, best_param)

    # And save the model to a file, along with:
    # - the scaling (mean/std) vectors
    # - the feature extraction parameters
    if classifier_type == "knn":
        feature_matrix = features.tolist()
        labels = labels.tolist()
        save_path = model_name
        save_parameters(save_path, feature_matrix, labels, mean, std,
                        class_names, best_param, mid_window, mid_step,
                        short_window, short_step, compute_beat)

    elif classifier_type == "svm" or classifier_type == "svm_rbf" or \
            classifier_type == "randomforest" or \
            classifier_type == "gradientboosting" or \
            classifier_type == "extratrees":
        with open(model_name, 'wb') as fid:
            cPickle.dump(classifier, fid)
        save_path = model_name + "MEANS"
        save_parameters(save_path, mean, std, class_names, mid_window,
                        mid_step, short_window, short_step, compute_beat)
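A sketch of the group-id variant (file paths and ids are hypothetical). Presumably dict_of_ids lets evaluate_classifier keep samples that share a group id, e.g. segments of the same recording session, on the same side of the train/test split:

# Hypothetical grouping: files from the same session share a group id.
ids = {"audio/speech/sess1_a.wav": 0,
       "audio/speech/sess1_b.wav": 0,
       "audio/music/sess2_a.wav": 1}
extract_features_and_train(["audio/speech", "audio/music"],
                           1.0, 1.0, 0.05, 0.05,
                           "svm_rbf", "svm_rbf_grouped",
                           dict_of_ids=ids, use_smote=True)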
Example #8
import torch
from sklearn.metrics import f1_score


def compute_f1(test_result, test_labels):
    # (hypothetical name and header; the original snippet began mid-function)
    values, labels = torch.max(test_result, 1)
    y_pred = labels.data.numpy()
    # sklearn's f1_score expects (y_true, y_pred):
    return f1_score(test_labels, y_pred)


from pyAudioAnalysis import MidTermFeatures as mt
from pyAudioAnalysis import audioTrainTest as aT
import numpy as np
import os

if os.path.isfile("features.npy"):
    with open('features.npy', 'rb') as f:
        X = np.load(f)
        y = np.load(f)
else:
    features, class_names, file_names = mt.multiple_directory_feature_extraction(
        ["audio/speech", "audio/noise"], 1, 1, 0.1, 0.1, False)
    X, y = aT.features_to_matrix(features)
    with open('features.npy', 'wb') as f:
        np.save(f, np.array(X))
        np.save(f, np.array(y))

dimensions = X.shape[1]

# Split to train/test
X_train = X[::2, :]
y_train = y[::2]
X_test = X[1::2, :]
y_test = y[1::2]

n_nodes = 256
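The snippet ends after defining n_nodes, so the model itself is missing. A plausible continuation, sketched purely as an assumption (network shape, optimizer, and epoch count are not from the original), is a single-hidden-layer classifier over the extracted features, evaluated with the F1 helper reconstructed at the top of this example:

import torch.nn as nn

# Assumed architecture: one hidden layer of n_nodes units mapping the
# pyAudioAnalysis feature vector to the two classes (speech vs. noise).
model = nn.Sequential(nn.Linear(dimensions, n_nodes),
                      nn.ReLU(),
                      nn.Linear(n_nodes, 2))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

X_train_t = torch.from_numpy(X_train).float()
y_train_t = torch.from_numpy(y_train).long()
for epoch in range(100):
    optimizer.zero_grad()
    loss = loss_fn(model(X_train_t), y_train_t)
    loss.backward()
    optimizer.step()

with torch.no_grad():
    test_result = model(torch.from_numpy(X_test).float())
print(compute_f1(test_result, y_test))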