def parseAllFeatures(self, indices, filenames):
        returnList = []
        returnLabels = []

        tot = np.zeros(len(indices))
        num = 0
        for el in filenames:
            classname = el.split('/')[-1].strip()
            # print (el, classname)
            try:
                [Fs, x] = audioBasicIO.readAudioFile(el)
            except ValueError:
                continue
            F = None
            if len(x.shape) == 1:
                F = audioFeatureExtraction.stFeatureExtraction(
                    x, Fs, 0.050 * Fs, 0.025 * Fs)
            else:
                F = audioFeatureExtraction.stFeatureExtraction(
                    x[:, 0], Fs, 0.050 * Fs, 0.025 * Fs)

            tot += np.mean(F[indices, :], axis=1)
            num += 1
            returnList.append(F[indices, :].T)
            if classname[0] == 'a':
                returnLabels.append(0)
            elif classname[0] == 'd':
                returnLabels.append(1)
            elif classname[0] == 'f':
                returnLabels.append(2)
            elif classname[0] == 'h':
                returnLabels.append(3)
            elif classname[0] == 'n':
                returnLabels.append(4)
            elif classname[0:2] == 'sa':
                returnLabels.append(5)
            else:
                returnLabels.append(6)
        returnListLength = len(returnList)

        random.seed(13921)

        shuffledIndices = random.sample(range(returnListLength),
                                        returnListLength)
        shuffledReturnList = [returnList[i] for i in shuffledIndices]
        shuffledReturnLabels = [returnLabels[i] for i in shuffledIndices]

        return shuffledReturnList, shuffledReturnLabels
    def extract_dataset(self, data, nb_samples, dataset, save=True):
        f_global = []

        i = 0
        for (x, Fs) in data:
            # 34D short-term feature
            f = audioFeatureExtraction.stFeatureExtraction(x, Fs, globalvars.frame_size * Fs, globalvars.step * Fs)

            # Harmonic ratio and pitch, 2D
            hr_pitch = audioFeatureExtraction.stFeatureSpeed(x, Fs, globalvars.frame_size * Fs, globalvars.step * Fs)
            f = np.append(f, hr_pitch.transpose(), axis=0)

            # Z-normalized
            f = stats.zscore(f, axis=0)

            f = f.transpose()

            f_global.append(f)

            sys.stdout.write("\033[F")
            i = i + 1
            print("Extracting features " + str(i) + '/' + str(nb_samples) + " from data set...")

        f_global = sequence.pad_sequences(f_global,
                                          maxlen=globalvars.max_len,
                                          dtype='float32',
                                          padding='post',
                                          value=globalvars.masking_value)

        if save:
            print("Saving features to file...")
            cPickle.dump(f_global, open(dataset + '_features.p', 'wb'))

        return f_global
Example #3
def extract(x, sr=16000):
    f_global = []

    # 34D short-term feature
    f = audioFeatureExtraction.stFeatureExtraction(x, sr,
                                                   globalvars.frame_size * sr,
                                                   globalvars.step * sr)

    # newer pyAudioAnalysis versions (Python 3) return a (features, feature_names) tuple
    if type(f) is tuple:
        f = f[0]

    # Harmonic ratio and pitch, 2D
    hr_pitch = audioFeatureExtraction.stFeatureSpeed(
        x, sr, globalvars.frame_size * sr, globalvars.step * sr)
    f = np.append(f, hr_pitch.transpose(), axis=0)

    # Z-normalized
    f = stats.zscore(f, axis=0)

    f = f.transpose()

    f_global.append(f)

    f_global = sequence.pad_sequences(f_global,
                                      maxlen=globalvars.max_len,
                                      dtype='float32',
                                      padding='post',
                                      value=globalvars.masking_value)

    return f_global
def extract_features(path_file, frame_size=25e-3, frame_stride=10e-3):
    """Function to combine logmel and frame level ST features
        extracted using pyAudioAnalysis Library
        Output: 40+22 = 62 dim logmel+ST features
    """
    [sample_rate, signal] = audioBasicIO.readAudioFile(path_file)
    frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate  # Convert from seconds to samples
    # signal_length = len(emphasized_signal)
    frame_length = int(round(frame_length))
    frame_step = int(round(frame_step))
    st_features = audioFeatureExtraction.stFeatureExtraction(
        signal, sample_rate, frame_length, frame_step)
    filter_banks = extract_logmel(path_file,
                                  frame_size=25e-3,
                                  frame_stride=10e-3,
                                  normalize=False)

    st_features = np.transpose(
        st_features)  # transpose to make frame_count as x-axis
    st_features = np.delete(st_features, np.s_[8:21],
                            axis=1)  # delete the MFCCs
    if st_features.shape[0] - filter_banks.shape[0] == 1:
        st_features = st_features[:-1, :]
    # print (st_features.shape[0], filter_banks.shape[0])
    features = np.c_[st_features, filter_banks]
    features -= (np.mean(features, axis=0) + 1e-8)
    return features
Example #5
def get_st_features(signal, rate, window_step=0.025, window_length=0.05):
    """Computes all 34 features for each window in a given signal
    Parameters
    ----------
    signal : numpy array
        All sample points for the audio signal
        Can be any type of number
    rate : int
        Sample rate of the audio signal, in Hz
    window_step : float
        Time step between each successive window, in seconds
        Default: 0.025 (25 ms)
    window_length : float
        Length of each window, in seconds
        Should generally be greater than window_step to allow for overlap between frames
        Default: 0.05 (50 ms)
    Returns
    ----------
    features : numpy array
        NumPy array of size 34 x (number of windows)
        Each row holds one feature across all frames; each column holds the 34 feature values for a single frame
    feature_names : [str]
        Name of the feature located at the corresponding row index"""

    sample_step = int(rate * window_step)
    sample_length = int(rate * window_length)

    (features, feature_names) = audioFeatureExtraction.stFeatureExtraction(
        signal, rate, sample_length, sample_step)

    return features, feature_names
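
A minimal usage sketch for get_st_features, assuming a mono 16-bit WAV read with scipy.io.wavfile (the filename is hypothetical):

from scipy.io import wavfile

# Hypothetical file; any mono PCM WAV works here.
rate, signal = wavfile.read("speech_sample.wav")
features, feature_names = get_st_features(signal, rate)
print(features.shape)      # 34 x number-of-windows
print(feature_names[:3])   # first few short-term feature names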
Example #6
File: run.py Project: hyunwooj/unm-cs429
def load_validation_set():
    """
    Output
        a tuple of features: (fft features, mfcc features, mean-std features)
    Description
        extracts three types of features from validation set.
    """
    ffts = dict()
    mfccs = dict()
    mean_stds = dict()

    for i in validation_ids:
        path = './validation/validation.{i}.wav'.format(i=i)

        _, X = read_wav(path)

        # FFT
        fft = np.array(abs(sp.fft(X)[:1000]))
        ffts.update({i: fft})

        # MFCC
        ceps, mspec, spec = mfcc(X)
        num_ceps = len(ceps)
        x = np.mean(ceps[int(num_ceps*1/10):int(num_ceps*9/10)], axis=0)
        mfccs.update({i: x})


        # Mean-Std
        [Fs, x] = audioBasicIO.readAudioFile(path);
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs);
        mean_std = []
        for f in F:
            mean_std.extend([f.mean(), f.std()])
        mean_stds.update({i: np.array(mean_std)})
    return (ffts, mfccs, mean_stds)
 def pitchProc(self):
     print('pitchProc = ' + (self.fname))
     [Fs, x] = audioBasicIO.readAudioFile(self.fname)
     info = audioFeatureExtraction.stFeatureExtraction(
         x, Fs, 0.050 * Fs, 0.025 * Fs)
     print(len(x))
     return info[0][1]
Example #8
def classifyEmotion(filePath):
    print("[INFO] Loading sound file")
    [Fs, x] = audioBasicIO.readAudioFile(filePath)
    x = audioBasicIO.stereo2mono(x)
    features, _ = audioFeatureExtraction.stFeatureExtraction(
        x, Fs, FRAME_SIZE * Fs, FRAME_SIZE / 2 * Fs)
    inputArray = np.expand_dims(features, axis=3)

    first_layer = model.get_layer(index=0)
    required_input_shape = first_layer.get_config()['batch_input_shape'][1:]

    # Adjust input to match required shape
    if required_input_shape[1] > inputArray.shape[1]:
        zerosArray = np.zeros(
            (required_input_shape[0],
             required_input_shape[1] - inputArray.shape[1], 1),
            dtype=inputArray.dtype)
        inputArray = np.concatenate((inputArray, zerosArray), axis=1)
    else:
        inputArray = inputArray[:, :required_input_shape[1], :]

    print("[INFO] classifying sound...")
    proba = model.predict(np.expand_dims(inputArray, axis=0))[0]
    idx = np.argmax(proba)
    label = lb.classes_[idx]

    label_with_predictions = {}
    for i in range(len(proba)):
        label_with_predictions[lb.classes_[i]] = proba[i]

    print("[INFO] Probabilities:", label_with_predictions)

    print("[INFO] Prediction {}".format(label))
    return label
Example #9
        def extract_features3(self,Fs,x):    
            x = audioBasicIO.stereo2mono(x)  # necessary conversion for pyaudio analysis
            #print len(x)

            # they must be 24k samples
            #coef = int(np.floor(len(x)/48000))

            #x = x[range(0,len(x),6)]
            #print len(x)
            # Fs=16000

            features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            if len(features) == 0:
                features = np.zeros((34, 2))

            features_mean = np.mean(features, axis=1)
            features_std = np.std(features, axis=1)
            features_kurtosis = kurtosis(features, axis=1)
            features_skew = skew(features, axis=1)

            vec4moments = np.append(np.append(np.append(features_mean, features_std), features_kurtosis), features_skew)

            result = np.asarray(vec4moments).reshape(len(vec4moments), -1).transpose()
            #print(np.shape(result))
            # features_complete = np.append(features_complete, features, axis=0)
            return result#vec4moments  # _complete
Example #10
def extract_features(dataset):
    data = dataset.data
    nb_samples = len(dataset.targets)
    frame_size = dataset.frame_size
    step = dataset.step
    f_global = []

    i = 0
    for (x, Fs) in data:
        # 34D short-term feature
        f = audioFeatureExtraction.stFeatureExtraction(x, Fs, frame_size * Fs, step * Fs)

        # newer pyAudioAnalysis versions (Python 3) return a (features, feature_names) tuple
        if type(f) is tuple:
            f = f[0]

        # Harmonic ratio and pitch, 2D
        hr_pitch = audioFeatureExtraction.stFeatureSpeed(x, Fs, frame_size * Fs, step * Fs)
        f = np.append(f, hr_pitch.transpose(), axis=0)

        # Z-normalized
        f = stats.zscore(f, axis=0)

        f = f.transpose()
        f = np.mean(f, axis=0)

        f_global.append(f)

        sys.stdout.write("\033[F")
        i = i + 1
        print("\t Extracting features " + str(i) + '/' + str(nb_samples) + " from data set...")

    return f_global
Example #11
def extract_features(path):
    print 'extract feature of test set'
    test_pkl = 'test34.pkl'
    if os.path.isfile(test_pkl):
        [test_set, list_filenames] = cPickle.load(open(test_pkl, 'rb'))
        return test_set, list_filenames
    test_set = []
    list_filenames = sorted(os.listdir(path))
    for filename in list_filenames:
        path_to_file = os.path.join(path, filename)
        [rate, sig] = audioBasicIO.readAudioFile(path_to_file)
        if (rate == -1 and sig == -1):
            #convert to wav
            #command = "ffmpeg -i {}".format(path_to_file)
            extension = os.path.splitext(filename)[-1]
            new_file = path_to_file.replace(extension, '.wav')
            command = "ffmpeg -i {} {}".format(path_to_file, new_file)
            os.system(command)
            [rate, sig] = audioBasicIO.readAudioFile(new_file)
            os.system('rm {}'.format(
                path_to_file))  #remove old file not in *.wav format
        if sig.ndim >= 2:  #merge multichannels into mono channel
            sig = np.mean(sig, axis=1)
        features = audioFeatureExtraction.stFeatureExtraction(
            sig, rate, win * rate, step * rate)
        features = features.reshape((features.shape[1], -1))
        test_set.append(features)
    cPickle.dump([test_set, list_filenames], open(test_pkl, 'wb'), -1)
    return test_set, list_filenames
Example #12
 def pitchProc2(self, results_dict):
     print("pitchProc2")
     [Fs, x] = audioBasicIO.readAudioFile(self.fname)
     info = audioFeatureExtraction.stFeatureExtraction(
         x, Fs, 0.050 * Fs, 0.025 * Fs)
     results_dict["pitch"] = info[0][1]
     return info[0][1]
Example #13
def newFeatures(inFile, isVAD):
    # read audio data from file
    #[Fs, x] = aIO.readAudioFile(inFile)
    x, Fs = librosa.load(inFile, sr=16000)  #sf.read(inFile)

    # window and overlap size
    win = int(0.025 * Fs)
    step = int(0.010 * Fs)

    # get short-time features
    Feats = aFE.stFeatureExtraction(x, Fs, win, step)

    if isVAD:
        energy = Feats[1]
        thv = energy.mean() * 0.1
        i_speechs = np.where(energy > thv)[0]
        Feats = Feats[:, i_speechs]

    # saveFeats(Feats)
    Feats = np.transpose(Feats[8:21, :])

    newFeat = []
    for row in range(len(Feats)):
        newFeat.append(Feats[row, :])

    return newFeat
Example #14
def predict_genre(song_path):
    optimal_training_features = load(OPTIMAL_TRAINING_FEATURES_DIR)
    training_data = load(OPTIMAL_TRAINING_FEATURES_DATA_DIR).item()

    X_train = training_data['X_train']
    y_train = training_data['y_train']
    x_test = []

    Fs, x = audioBasicIO.readAudioFile(song_path)
    F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)

    features_to_extract = sorted(optimal_training_features, key=itemgetter('feature_id'))
    for feature in features_to_extract:
        feature_index = feature['feature_id'] - 1
        feature_prop = feature['feature_prop']
        feature_data = None
        if feature_prop == 'min':
            feature_data = min(F[feature_index, :])
        elif feature_prop == 'mean':
            feature_data = mean(F[feature_index, :])
        elif feature_prop == 'max':
            feature_data = max(F[feature_index, :])
        x_test.append(feature_data)

    X_train_scale, x_test_scale = scale_data_multiple(X_train, x_test)

    prediction = k_nearest_neighbor.k_nearest_neighbor(X_train_scale, y_train, x_test_scale, training_data['k'])
    return prediction
def getClassIDsToClipFramesMFCCs(classIDsToClipsMap):
    classToMFCCsOfClips = {}
    for classID in classIDsToClipsMap:
        classToMFCCsOfClips[classID] = []
        for singleEventAudioClip in classIDsToClipsMap[classID]:
            # this returns a matrix that looks like:
            """
                  zcr   | energy | energy_entropy | .... | mfcc_0 | mfcc_1 | ... | ... |
            win0  value   value
            ----
            win1  value
            ----
            .
            .
            .
            """
            # so featuresMatrix[0] = a list of the zcr for each window, and featuresMatrix[0][0]
            # would be the zcr for the very first window
            # we care about featuresMatrix[8:21] (rows 8 through 20) for the 13 MFCCs

            featuresMatrix, featureNames = fe.stFeatureExtraction(
                singleEventAudioClip, sf, window_size, step_size)
            mfccsForEachWindow = []
            # for each window, loop through indices 8-20 to get all of its mfccs into one list, and compile all of
            # that into a bigger list
            for i in range(0, len(featuresMatrix[8])):
                mfccsForOneWindow = []
                for j in range(8, 21):
                    mfccsForOneWindow.append(featuresMatrix[j][i])
                mfccsForEachWindow.append(mfccsForOneWindow)

            # have list that looks like [ [mfcc0, mfcc1, mfcc2...], [mfcc0...], ... ] for each window in one clip
            classToMFCCsOfClips[classID].append(mfccsForEachWindow)

    return classToMFCCsOfClips
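
The two nested window loops above can be collapsed into a single NumPy slice; a sketch of the equivalent operation on one clip's featuresMatrix (same rows 8 through 20, i.e. the 13 MFCCs):

# Equivalent to the per-window loops: take the 13 MFCC rows and transpose so
# each entry of the result is the MFCC vector of one window.
mfccsForEachWindow = featuresMatrix[8:21, :].T.tolist()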
def extractFeatures():
    files = []
    print "Getting example names..."
    with open('./labels') as f:
        for line in f:
            files.append("_".join(line.split("\t")[0].split("_")[:-1]) +
                         '.wav')
    f.close()

    print "Walking through files to extract features..."
    for dirpath, dirnames, filenames in os.walk(CLIPS_DIR_PATH):
        for f in filenames:
            if f in files:
                print "Extracting for", f
                path = './clips/clips/' + f
                audiofile = AudioSegment.from_file(path)
                data = np.fromstring(audiofile._data, np.int16)
                Fs = audiofile.frame_rate
                x = []
                for chn in xrange(audiofile.channels):
                    x.append(data[chn::audiofile.channels])
                x = np.array(x).T
                if x.ndim == 2:
                    if x.shape[1] == 1:
                        x = x.flatten()
                try:
                    features = audioFeatureExtraction.stFeatureExtraction(
                        x, Fs, 0.050 * Fs, 0.025 * Fs).T
                    np.save(FEATURE_PICKLES + f, features)
                except ValueError as e:
                    print e
Example #17
def process_request():
    data = request.get_json(force=True)
    feature_type = data['feat_type']
    assert feature_type in ['raw', 'mfcc', 'all']
    model_input = data['m_in']

    y = pd.read_json(data['feat']).as_matrix().squeeze()
    # get the sampled raw signal
    if feature_type == 'raw':
        sr = data['sr']
        if model_input == 'mfcc':
            feat = librosa.feature.mfcc(y=y, sr=int(sr),
                                        n_mfcc=40).T  # (S, 40)
            feat = th.FloatTensor(feat).unsqueeze(0)
        elif model_input == 'all':
            all_feats, f_names = audioFeatureExtraction.stFeatureExtraction(
                y, sr, 2048, 512)
            feat = th.FloatTensor(all_feats.T).unsqueeze(0)

    # get the mfcc directly
    elif feature_type == 'mfcc':
        feat = th.FloatTensor(y).view(-1, 40).unsqueeze(0)

    elif feature_type == 'all':
        feat = th.FloatTensor(y).view(-1, 34).unsqueeze(0)

    # load model
    model = SER(h_size=200, feat_size=feat.size(2), class_num=4, dropout=0.)
    #model.cuda()
    model.eval()
    #model.load_state_dict(th.load('checkpoint/model.pt'))
    pred = model(feat, [feat.size(1)], None, None)
    pred = pred.max(dim=1)[1].item()

    return str(pred)
Example #18
 def __init__(self, filename):
     [Fs, x] = audioBasicIO.readAudioFile(filename)
     F = audioFeatureExtraction.stFeatureExtraction(
         np.mean(x, axis=1) if x.ndim == 2 else x, Fs, 0.050 * Fs,
         0.025 * Fs)
     # print (F[0][1])
     self.input_from_audio = F[0][1]
def build_MFCC_for_one_sound_slice(folder, sound_slice):
    '''
    Builds the MFCC coeffs, given the sound_slice and the folder containing it
    :param folder: str, name of the folder where sound_slice is located
    :param sound_slice: str, filename of the audio slice
    :return: list of flattened MFCC values
    sound_slice must be in .wav format, not .mp3 format, in order to apply pyAudioAnalysis
    '''

    sound_slice_fullname = os.path.join(folder, sound_slice)
    [Fs, x] = audioBasicIO.readAudioFile(sound_slice_fullname)
    F, f_names = audioFeatureExtraction.stFeatureExtraction(
        x, Fs, 0.050 * Fs, 0.025 * Fs)
    G = resample_matrix(F,
                        num=15)  # see the definition of resample_matrix above
    #feat_list=[]
    #feat_list.append(G)
    '''Below, we extract the MFCCs from the extracted and then resampled features above.
    For each i, MFCC_list[i] gives, for the i-th audio, all 13 feature vectors, each of them the 15-dimensional
    resampled values of the original feature vectors'''
    MFCC_list = []
    #for i in range(len(feat_list)):
    #MFCC = feat_list[i][8:21, :]  # 9th to 21-st features are the MFCC coeffs
    MFCC = G[8:21, :]  # rows 8 to 20 are the 13 MFCC coeffs
    MFCC_flat = np.ndarray.flatten(
        MFCC
    )  # flattening the array, but are we destroying the time-series structure?
    MFCC_flattened_as_list = list(
        MFCC_flat
    )  # MFCC_flat was an np array, so we convert it into a list, to avoid dims like [1, foo, bar]
    #MFCC_list.append(MFCC_flat_as_list)
    #MFCC_array = np.asarray(MFCC_list)
    return MFCC_flattened_as_list
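
A usage sketch for the function above (folder and filename are hypothetical; it assumes resample_matrix yields 15 columns per feature row, as the comments state):

# Hypothetical call: with 13 MFCC rows resampled to 15 points each,
# the returned list should contain 13 * 15 = 195 values.
mfcc_vector = build_MFCC_for_one_sound_slice("sound_slices", "clip_001.wav")
print(len(mfcc_vector))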
Example #20
def extract_mfcc_features(filename):
    print(filename)
    [Fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs,
                                                   0.025 * Fs)
    mfcc = F[8:21, :]  # 13 mfcc features
    return mfcc
Example #21
def beatExtractionWrapper(wav_file, plot):
    if not os.path.isfile(wav_file):
        raise Exception("Input audio file not found!")
    [fs, x] = audioBasicIO.readAudioFile(wav_file)
    F, _ = aF.stFeatureExtraction(x, fs, 0.050 * fs, 0.050 * fs)
    bpm, ratio = aF.beatExtraction(F, 0.050, plot)
    print("Beat: {0:d} bpm ".format(int(bpm)))
    print("Ratio: {0:.2f} ".format(ratio))
Example #23
def read_files(files):
	X = []
	for fn in tqdm(files):
	    y, sr = librosa.load(fn, sr=8000)
	    y = preprocess(y)
	    features = audioFeatureExtraction.stFeatureExtraction(y, sr, 0.10*sr, .05*sr)
	    X.extend(features)
	return X
Example #24
def featureExtractor(fileName):
	[Fs, x] = audioBasicIO.readAudioFile(fileName)
	Features = audioFeatureExtraction.stFeatureExtraction(x, Fs,0.001 * Fs, 0.0003 * Fs)
	MFCCs = []
	
	for index in range(len(Features)):
		MFCCs.append(float(np.mean(Features[index])))
	return MFCCs
Example #25
def beatExtractionWrapper(wavFileName, plot):
    if not os.path.isfile(wavFileName):
        raise Exception("Input audio file not found!")
    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
    F = aF.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.050 * Fs)
    BPM, ratio = aF.beatExtraction(F, 0.050, plot)
    print("Beat: {0:d} bpm ".format(int(BPM)))
    print("Ratio: {0:.2f} ".format(ratio))
Example #26
def getStVectorPerWav(wavFile, stWin,
                      stStep):  # given a wav, get entire sT features
    [Fs, x] = getTotalAudio([wavFile])
    ShortTermFeatures = aF.stFeatureExtraction(x, Fs, stWin * Fs, stStep * Fs)
    [featuresNormSS, MEANSS, STDSS
     ] = aT.normalizeFeatures([ShortTermFeatures])  # normalize to 0-mean 1-std
    [X, y] = featureListToVectors([featuresNormSS])
    return X, y, Fs
Example #27
def _get_feature(file_path):
    """使用pyAudioAnalysis获取特征名称和对应的特征值"""
    # [fs, x] = audioBasicIO.readAudioFile(file_path)
    sr, x = read(file_path)
    x = audioBasicIO.stereo2mono(x)
    f, f_names = audioFeatureExtraction.stFeatureExtraction(x, sr, 0.050 * sr, 0.025 * sr)

    return f_names, [np.mean(fm) for fm in f]
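
A small usage sketch pairing each returned feature name with its mean value (the WAV path is illustrative):

# Hypothetical usage of _get_feature defined above.
names, means = _get_feature("example.wav")
for name, value in zip(names, means):
    print(name, value)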
Example #28
def extract_mfcc(signal: np.ndarray,
                 sample_rate: int = 44100,
                 window: float = 0.5,
                 stride: float = 0.25):
    feats, f_names = audioFeatureExtraction.stFeatureExtraction(
        signal, sample_rate, sample_rate * window, stride * sample_rate)

    return feats.T, f_names
def preProcess(fileName):
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    if (len(x.shape) > 1 and x.shape[1] == 2):
        x = np.mean(x, axis=1, keepdims=True)
    else:
        x = x.reshape(x.shape[0], 1)
    F, f_names = audioFeatureExtraction.stFeatureExtraction(
        x[:, 0], Fs, 0.050 * Fs, 0.025 * Fs)
    return (f_names, F)
Example #30
 def get_desired_features(self, int_audio_frames):
     feature_vector = fe.stFeatureExtraction(int_audio_frames,
                                             self.sample_rate,
                                             self.frame_size_samples,
                                             self.frame_step_samples)
     output = []
     for desired_index in self.desired_features_indices:
         output.append(feature_vector[desired_index])
     return np.array(output)
Example #31
def analyze_audio_response(filename):
    # This function analyzes the yes/no response in a .wav file and returns a 34 x (number of time frames, which depends on how long the response is) matrix
    from pyAudioAnalysis import audioBasicIO
    from pyAudioAnalysis import audioFeatureExtraction
    import matplotlib.pyplot as plt
    [Fs, x] = audioBasicIO.readAudioFile(filename)
    F = audioFeatureExtraction.stFeatureExtraction(x[:, 0], Fs, Fs * 0.05,
                                                   Fs * 0.025)
    return F
Example #32
        def extract_features(self, file_path):
            [Fs, x] = audioBasicIO.readAudioFile(file_path)

            x = audioBasicIO.stereo2mono(x)  # necessary conversion for pyaudio analysis
            features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            features = np.mean(features, axis=1)
            features = np.asarray(features).reshape(len(features), -1).transpose()
            # features_complete = np.append(features_complete, features, axis=0)
            return features  # _complete
Example #33
def process_mp3_files():
    files = read_input()
    os.system("touch test.wav")
    for mp3_file in files:
        mean_value = []
        sound = AudioSegment.from_mp3(mp3_file)
        sound.export("test.wav", format="wav")
        # print mp3_file
        [Fs, x] = audioBasicIO.readAudioFile("test.wav")
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
        for i in range(len(F)):
            mean_value.append(numpy.mean(F[i]))
        compute_emotion(mean_value)
Example #34
File: run.py Project: hyunwooj/unm-cs429
def add_audio_feature_extraction(Fs, x, label_id, features, labels):
    """
    Input
        Fs: frequency
        x: signal
        label_id: label(genre) id
        features: array of ffts
        labels: array of labels
    Description
        extracts the features listed here (https://github.com/tyiannak/pyAudioAnalysis/wiki/3.-Feature-Extraction)
        from x and appends them to features.
    """
    F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs);
    features.append(F)
    labels.append(label_id)
Example #35
def showFeatures(name):
    print("processing - " + name)
    [Fs, x] = audioBasicIO.readAudioFile(name)
    # print(x)
    F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.50 * Fs, 0.25 * Fs)
    # print(x.size, Fs, 0.50 * Fs, 0.25 * Fs)
    # a = F[0, :]
    # numpy.savetxt("foo.csv", a, delimiter=",")

    # plt.subplot(3, 1, 1)
    # plt.plot(F[0, :])
    # plt.xlabel('Frame no')
    # plt.ylabel('ZCR')
    #
    # plt.subplot(3, 1, 2)
    # plt.plot(F[1, :])
    # plt.xlabel('Frame no')
    # plt.ylabel('Energy')
    #
    # plt.subplot(3, 1, 3)
    # plt.plot(F[3, :])
    # plt.xlabel('Frame no')
    # plt.ylabel('SC')
    #
    # plt.show()
    # items = ' '.join(map(str, a))
    # print(items)
    # print("--", F[0, :])
    vec = [
        F[0, :].mean(), F[1, :].mean(), F[4, :].mean(), F[5, :].mean(), F[6, :].mean(), F[7, :].mean(),
        F[0, :].std(), F[1, :].std(), F[4, :].std(), F[5, :].std(), F[6, :].std(), F[7, :].std()
    ]

    vecstr = ' '.join(map(str, vec))
    print("vector in audio.py : ",vecstr);
    melfeat = melfeature(F)
    # chromafeat = chromafeature(F)
    return vecstr + " " + melfeat
def main(argv):
	if argv[1] == "-shortTerm":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)
			t1 = time.clock()
			F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs);
			t2 = time.clock()
			perTime1 =  duration / (t2-t1); print "short-term feature extraction: {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-classifyFile":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)		
			t1 = time.clock()
			aT.fileClassification("diarizationExample.wav", "svmSM","svm")
			t2 = time.clock()
			perTime1 =  duration / (t2-t1); print "Mid-term feature extraction + classification \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-mtClassify":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)		
			t1 = time.clock()
			[flagsInd, classesAll, acc] = aS.mtFileClassification("diarizationExample.wav", "svmSM", "svm", False, '')
			t2 = time.clock()
			perTime1 =  duration / (t2-t1); print "Fix-sized classification - segmentation \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-hmmSegmentation":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)		
			t1 = time.clock()
			aS.hmmSegmentation('diarizationExample.wav', 'hmmRadioSM', False, '')             
			t2 = time.clock()
			perTime1 =  duration / (t2-t1); print "HMM-based classification - segmentation \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-silenceRemoval":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)				
			t1 = time.clock()
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			segments = aS.silenceRemoval(x, Fs, 0.050, 0.050, smoothWindow = 1.0, Weight = 0.3, plot = False)
			t2 = time.clock()
			perTime1 =  duration / (t2-t1); print "Silence removal \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-thumbnailing":
		for i in range(nExp):
			[Fs1, x1] = audioBasicIO.readAudioFile("scottish.wav")
			duration1 = x1.shape[0] / float(Fs1)		
			t1 = time.clock()
			[A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x1, Fs1, 1.0, 1.0, 15.0)	# find thumbnail endpoints			
			t2 = time.clock()
			perTime1 =  duration1 / (t2-t1); print "Thumbnail \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-diarization-noLDA":
		for i in range(nExp):
			[Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
			duration1 = x1.shape[0] / float(Fs1)		
			t1 = time.clock()		
			aS.speakerDiarization("diarizationExample.wav", 4, LDAdim = 0, PLOT = False)
			t2 = time.clock()
			perTime1 =  duration1 / (t2-t1); print "Diarization \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-diarization-LDA":
		for i in range(nExp):
			[Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
			duration1 = x1.shape[0] / float(Fs1)		
			t1 = time.clock()		
			aS.speakerDiarization("diarizationExample.wav", 4, PLOT = False)
			t2 = time.clock()
			perTime1 =  duration1 / (t2-t1); print "Diarization \t {0:.1f} x realtime".format(perTime1)
Example #37
		raise Exception("Dimension of the covariance matrix and data should match")
	invcov = cov.T
	mean = np.reshape(mean, (1, n))

	x = x - (np.ones((d, 1))*mean).T
	fact = np.sum(((np.dot(invcov, x))*x), axis = 1)

	y = np.exp(-0.5*fact)

	y = np.divide(y, math.pow((2*math.pi), n)*np.std(cov))

	return y


# feature extraction from the library pyAudioAnalysis
attribute = audioFeatureExtraction.stFeatureExtraction(x, Fs, SIZE_OF_WINDOW, SIZE_OF_STEP)

# relationship between the similarity and the timestamp in the audio
relation = [[1 for col in range(2)] for row in range(attribute.shape[1]/BLOCK_STEP)]

while (END_OF_FILE == 0):
	if (FIRST_PAIR == 1):
		# for the first pari of the block
		block_i_index_start = 0
		block_i_index_end = BLOCK_SIZE
		block_i_attribute = getMFCCs(block_i_index_start, block_i_index_end)


		block_i_mean = np.mean(block_i_attribute, axis=1)
		block_i_cov = np.cov(block_i_attribute)
		block_i_log_like = np.log(gauss(block_i_attribute, mean=block_i_mean, cov=block_i_cov))
Example #38
def features(filename, tag):
	signal, sampfreq = lr.load(filename)
	features = afe.stFeatureExtraction(signal, sampfreq, 0.050 * sampfreq, 0.025 * sampfreq)
	return signal, sampfreq, features
Example #39
def train_classifier():
    data_set = []
    for file in os.listdir("training_dataset/unhappy"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            # print "training_dataset/unhappy/"+file
            sound = AudioSegment.from_mp3("training_dataset/unhappy/" + file)
            sound.export("test.wav", format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav")
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(1)
            data_set.append(mean_value)
    for file in os.listdir("training_dataset/happy"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            # print "training_dataset/happy/"+file
            sound = AudioSegment.from_mp3("training_dataset/happy/" + file)
            sound.export("test.wav", format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav")
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(2)
            data_set.append(mean_value)
    for file in os.listdir("training_dataset/angry"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            # print "training_dataset/angry/"+file
            sound = AudioSegment.from_mp3("training_dataset/angry/" + file)
            sound.export("test.wav", format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav")
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(3)
            data_set.append(mean_value)
    for file in os.listdir("training_dataset/neutral"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            # print "training_dataset/neutral/"+file
            sound = AudioSegment.from_mp3("training_dataset/neutral/" + file)
            sound.export("test.wav", format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav")
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(4)
            data_set.append(mean_value)

    x = []
    y = []
    for i in range(len(data_set)):
        x.append(data_set[i][0])
        y.append(data_set[i][1])

    clf = RandomForestClassifier(n_estimators=30, max_features=6, max_depth=None, min_samples_split=1, bootstrap=True)
    clf = clf.fit(x, y)
    f2 = open("classifier.pickle", "wb")
    pickle.dump(clf, f2)
    f2.close()
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt


[Fs1, x1] = audioBasicIO.readAudioFile("happy.wav");
[Fs2, x2] = audioBasicIO.readAudioFile("sad.wav");
# Fs is frequency
# x is real data

th = 100 # fixed fea length
k12 = (len(x1)-800)/th/float(Fs1)
k22 = (len(x2)-800)/th/float(Fs2)


F1, f_names1 = audioFeatureExtraction.stFeatureExtraction(x1, Fs1, 0.05*Fs1, k12*Fs1);
F2, f_names2 = audioFeatureExtraction.stFeatureExtraction(x2, Fs2, 0.05*Fs2, k22*Fs2);
# stFeatureExtraction(signal, fs, win, step):
# signal:       the input signal samples
# fs:           the sampling freq (in Hz)
# win:          the short-term window size (in samples)
# step:         the short-term window step (in samples)
'''
here,
window size = 0.05*Fs = 0.05*16000 = 800
step size = 0.025*Fs = 0.025*16000 = 400
we can get n frames from a signal of length 23776

400*n + 800 = 23776 -> n = 57.44, i.e. 58 frames

as below, F.shape = (34, 58)
'''
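
A quick check of the frame-count arithmetic described above (a sketch; the exact count can differ by one depending on the pyAudioAnalysis version's windowing loop):

# window = 800 samples, step = 400 samples, signal length = 23776 samples
n_samples, win, step = 23776, 800, 400
n_frames = int((n_samples - win) / step) + 1
print(n_frames)   # 58, matching F.shape = (34, 58)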
Example #41
def main(path):
    ds = Dataset(path)
    loader = Loader(path + "/train/", 32, 16)
    X = []
    y = []
    Z = []
    ii = 0
    for p in ds.trainTracks():
        f = p.split("/")
        name = f[len(f) - 1]
        labelTeller = loader.loadLabelsForSoundfile(name)
        [Fs, x] = audioBasicIO.readAudioFile(p)
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.032 * Fs, 0.016 * Fs)
        G = zip(*F)
        N = 0
        if len(G) > labelTeller.tellNoOfAllBlocks():
            N = labelTeller.tellNoOfAllBlocks()
        else:
            N = len(G)

        for i in range(N):
            Z.append([G[i], labelTeller.tell(i)])

        # i = 0
        # for w in ds.windows(x,44100, 1410, 705):
        #    mf = mfcc(w)
        # row = [i]
        #    Z.append([mf[0],labelTeller.tell(i)])
        #    i = i+1

        print p + " " + str(ii) + "/61"
        ii = ii + 1

    print "shuffle"
    random.shuffle(Z)
    Z = zip(*Z)

    NN = 20000
    L = NN
    R = NN
    FINAL = [[], []]
    for i in range(len(Z[0])):
        if Z[1][i] == "sing" and L > 0:
            L = L - 1
            FINAL[0].append(Z[0][i])
            FINAL[1].append(Z[1][i])

        if Z[1][i] == "nosing" and R > 0:
            R = R - 1
            FINAL[0].append(Z[0][i])
            FINAL[1].append(Z[1][i])

    clf = svm.SVC(cache_size=2000)
    print "######### " + str(len(Z[0]))
    clf.fit(FINAL[0], FINAL[1])
    loader = Loader(path + "/test/", 32, 16)

    print "Loading test"
    for p in ds.validationTracks():
        X = []
        y = []
        f = p.split("/")
        name = f[len(f) - 1]
        labelTeller = loader.loadLabelsForSoundfile(name)
        i = 0

        [Fs, x] = audioBasicIO.readAudioFile(p)
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.032 * Fs, 0.016 * Fs)
        G = zip(*F)
        N = 0
        if len(G) > labelTeller.tellNoOfAllBlocks():
            N = labelTeller.tellNoOfAllBlocks()
        else:
            N = len(G)

        for i in range(N):
            X.append(G[i])
            y.append(labelTeller.tell(i))

        print "Starting prediction " + p

        Y = clf.predict(X)
        ok = 0
        al = 0
        for i in range(len(y)):
            if y[i] == Y[i]:
                ok = ok + 1
            al = al + 1

        print ok / float(al)
Example #42
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt
[Fs, x] = audioBasicIO.readAudioFile("../audio_data/doremi.wav")
print Fs
print len(x)
#using a frame size of 50 msecs and a frame step of 25 msecs (50% overlap)
F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs)
"""
    stFeatureExtraction
    This function implements the short-term windowing process. For each short-term window a set of features is extracted.
    This results in a sequence of feature vectors, stored in a numpy matrix.
    ARGUMENTS
        signal:       the input signal samples
        Fs:           the sampling freq (in Hz)
        Win:          the short-term window size (in samples)
        Step:         the short-term window step (in samples)
    RETURNS
        stFeatures:   a numpy array (numOfFeatures x numOfShortTermWindows)
"""
print len(F)
plt.subplot(2,1,1); plt.plot(F[0,:]); plt.xlabel('Frame no'); plt.ylabel('ZCR')
plt.subplot(2,1,2); plt.plot(F[1,:]); plt.xlabel('Frame no'); plt.ylabel('Energy'); plt.show()