def main():
    import optparse
    
    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\n"\
                           "solo (vocal) part",
                      default="estimated_solo.wav")
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\n"\
                           "music part",
                      default="estimated_music.wav")
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated pitches",
                      default="pitches.txt")
    
    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true",help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to quiet all output verbose",
                      default=True)
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int",
                      default=100)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644,help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=2048,
                      help="size of Fourier transforms, "\
                           "in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")
    parser.add_option("--nb-accElements", dest="R", type="float",
                      default=40.0,
                      help="number of elements for the accompaniment.")
    
    parser.add_option("--with-melody", dest="melody", type="string",
                      default=None,
                      help="provide the melody in a file named MELODY, "\
                           "with at each line: <time (s)><F0 (Hz)>.")
    
    (options, args) = parser.parse_args()
    
    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")
        
    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab
        
        ## plt.rc('text', usetex=True)
        plt.rc('image',cmap='jet') ## gray_r
        plt.ion()
        
    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    fs, data = wav.read(inputAudioFile)
    # data = np.double(data) /  32768.0 # makes data vary from -1 to 1
    scaleData = 1.2 * data.max() # to rescale the data.
    dataType = data.dtype
    data = np.double(data) / scaleData # makes data vary from -1 to 1

    # If the input is mono, duplicate the channel so that the rest of
    # the stereo pipeline can run unchanged:
    if data.ndim == 1:
        print "The audio file is not stereo: duplicating the mono channel."
        data = np.vstack([data, data]).T
    elif data.shape[1] != 2:
        print "The data is multichannel, but not stereo..."
        print "Unfortunately this program does not scale well. Data is"
        print "reduced to its first 2 channels."
        data = data[:, 0:2]
    
    # Processing the options:
    windowSizeInSamples = np.round(options.windowSize * fs)
    hopsize = np.round(options.hopsize * fs)
    NFT = options.fourierSize
    niter = options.nbiter
    R = options.R
    
    if options.verbose:
        print "Some parameter settings:"
        print "    Size of analysis windows: ", windowSizeInSamples
        print "    Hopsize: ", hopsize
        print "    Size of Fourier transforms: ", NFT
        print "    Number of iterations to be done: ", niter
        print "    Number of elements in WM: ", R 
    
    XR, F, N = stft(data[:,0], fs=fs, hopsize=hopsize,
                   window=sinebell(windowSizeInSamples), nfft=NFT)
    XL, F, N = stft(data[:,1], fs=fs, hopsize=hopsize,
                   window=sinebell(windowSizeInSamples), nfft=NFT)
    # SX is the power spectrogram:
    ## SXR = np.maximum(np.abs(XR) ** 2, 10 ** -8)
    ## SXL = np.maximum(np.abs(XL) ** 2, 10 ** -8)
    SXR = np.abs(XR) ** 2
    SXL = np.abs(XL) ** 2
    
    del data, F, N
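    # NB: the F and N returned by stft (frequency and time vectors) were
    # just deleted; below, F and N are re-bound to the dimensions of the
    # power spectrogram SXR.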
    
    # TODO: also process these as options:
    eps = 10 ** -9
    minF0 = 100
    maxF0 = 800
    Fs = fs
    F, N = SXR.shape
    stepNotes = 20 # this is the number of F0s within one semitone
    # until 17/09/2010 : stepNotes = 20
    # 17/09/2010 : trying stepNotes = 8, checking for less artefacts
    
    K = 10 # number of spectral shapes for the filter part
    # R = 40 # number of spectral shapes for the accompaniment
    P = 30 # number of elements in dictionary of smooth filters
    chirpPerF0 = 1 # number of chirped spectral shapes between each F0
    # this feature should be further studied before
    # we find a good way of doing that.
    
    # Create the harmonic combs, for each F0 between minF0 and maxF0: 
    F0Table, WF0 = \
             generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \
                                  stepNotes=stepNotes, \
                                  lengthWindow=windowSizeInSamples, Ot=0.25, \
                                  perF0=chirpPerF0, \
                                  depthChirpInSemiTone=.15, loadWF0=True,\
                                  analysisWindow='sinebell')
    WF0 = WF0[0:F, :] # ensure same size as SX 
    NF0 = F0Table.size # number of harmonic combs
    # Normalization: 
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))
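    # After this normalization, each column of WF0 (one harmonic comb,
    # i.e. one F0 candidate) has a maximum value of 1.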
    
    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=fs, frequencyScale='linear', \
                               numberOfBasis=P, overlap=.75)
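    # WGAMMA is an F x P matrix whose P columns are overlapping Hann
    # windows on a linear frequency scale: a dictionary of smooth
    # elementary filter shapes for the lead instrument's filter part.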
    
    if displayEvolution:
        plt.figure(1);plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()
        # plt.show()
        ## the following seems superfluous if mpl's backend is macosx...
        ##        raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\
        ##                  "!! Press Return to resume the program. !!\n"\
        ##                  "!! Be sure that the figure has been    !!\n"\
        ##                  "!! already displayed, so that the      !!\n"\
        ##                  "!! evolution of HF0 will be visible.   !!\n"\
        ##                  "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        
    if options.melody is None:
        ## section to estimate the melody, on monophonic algo:
        SX = np.maximum(np.abs((XR + XL) / 2.0) ** 2, 10 ** -8)
        # First round of parameter estimation:
        HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
            # the data to be fitted to:
            SX,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
            # putting only 2 elements in accompaniment for a start...
            # if any, initial amplitude matrices for 
            HGAMMA0=None, HPHI0=None,
            HF00=None,
            WM0=None, HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter, updateRulePower=1.,
            stepNotes=stepNotes, 
            lambdaHF0 = 0.0 / (1.0 * SX.max()), alphaHF0=0.9,
            verbose=options.verbose, displayEvolution=displayEvolution)
        
        if displayEvolution:
            h2 = plt.figure(2);plt.clf();
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])
            
        # Viterbi decoding to estimate the predominant fundamental
        # frequency line
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0,NF0) / stepNotes) * scale)
        cutoffnote = 2 * 5 * stepNotes
        transitions[cutoffnote:] = transitions[cutoffnote - 1]
        
        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1]) # toeplitz matrix
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = \
                                  transitions[\
            np.array(np.abs(np.outer(np.ones(NF0), b) \
                            - np.outer(b, np.ones(NF0))), dtype=int)]
        pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
        p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
        p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0
        
        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 \
                             / np.outer(sumTransitionMatrixF0, \
                                        np.ones(NF0 + 1))
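        # Each row of transitionMatrixF0 now sums to 1: a proper
        # stochastic matrix over the NF0 pitch states plus the extra
        # "silence" state at index NF0.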
        
        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        barHF0 = np.array(HF0)
        
        logHF0[0:NF0, :] = np.log(barHF0)
        logHF0[0:NF0, normHF0==0] = np.amin(logHF0[logHF0>-np.Inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0>-np.Inf]),-100)
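        # Row NF0 holds the log-"probability" of the silence state,
        # floored at -100; frames where HF0 is all zero are also floored
        # to the smallest finite log value, so the Viterbi decoder never
        # sees -Inf.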
        
        indexBestPath = viterbiTrackingArray(\
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=options.verbose)
        
        if displayEvolution:
            h2.hold(True)
            plt.plot(indexBestPath, '-b')
            h2.hold(False)
            plt.axis('tight')
            ##         raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\
            ##                   "!! Press Return to resume the program  !!\n"\
            ##                   "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            
        del logHF0
        
        # detection of silences:
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        scopeAllowedHF0 = 2.0 / 1.0
        dim1index = np.array(\
            np.maximum(\
                np.minimum(\
                    np.outer(chirpPerF0 * indexBestPath,
                             np.ones(chirpPerF0 \
                                     * (2 \
                                        * np.floor(stepNotes / scopeAllowedHF0) \
                                        + 1))) \
                    + np.outer(np.ones(N),
                               np.arange(-chirpPerF0 \
                                         * np.floor(stepNotes / scopeAllowedHF0),
                                         chirpPerF0 \
                                         * (np.floor(stepNotes / scopeAllowedHF0) \
                                            + 1))),
                    chirpPerF0 * NF0 - 1),
                0),
            dtype=int).reshape(1, N * chirpPerF0 \
                               * (2 * np.floor(stepNotes / scopeAllowedHF0) \
                                  + 1))
        dim2index = np.outer(np.arange(N),
                             np.ones(chirpPerF0 \
                                     * (2 * np.floor(stepNotes \
                                                     / scopeAllowedHF0) + 1), \
                                     dtype=int)\
                             ).reshape(1, N * chirpPerF0 \
                                       * (2 * np.floor(stepNotes \
                                                       / scopeAllowedHF0) \
                                          + 1))
        HF00[dim1index, dim2index] = HF0[dim1index, dim2index]# HF0.max()
        
        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)
        energyMel = np.sum(np.abs((SPHI * SF0)/hatSX * \
                                  (XR+XL) * 0.5) \
                           ** 2, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized to the maximum of energy:
        # expressed in 0.01 times the percentage
        indAboveThres = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        # np.nonzero never returns None; check for an empty result
        # instead (the original `is None` test could never trigger):
        ind_999 = indAboveThres[0] if indAboveThres.size > 0 else N - 1
        
        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])
        indexBestPath[melNotPresent] = 0
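        # Frames whose melody energy falls under the cumulative-energy
        # threshold are declared unvoiced: their path index is reset to
        # 0, the convention used for silence below.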
        
    else:
        ## take the provided melody line:
        # load melody from file:
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            print "The melody should be provided as <Time (s)><F0 (Hz)>."
            raise ValueError("Bad melody format")
        melTimeStamps = melodyFromFile[:,0] # + 1024 / np.double(Fs)
        melFreqHz = melodyFromFile[:,1]
        if minF0 > melFreqHz[melFreqHz>40.0].min() or maxF0 < melFreqHz.max():
            minF0 = melFreqHz[melFreqHz>40.0].min() *.97
            maxF0 = np.maximum(melFreqHz.max()*1.03, 2*minF0 * 1.03)
            print "Recomputing the source basis for "
            print "minF0 = ", minF0, "Hz and maxF0 = ", maxF0, "Hz."
            # Create the harmonic combs, for each F0 between minF0 and maxF0: 
            F0Table, WF0 = \
                     generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \
                                          stepNotes=stepNotes, \
                                          lengthWindow=windowSizeInSamples,
                                          Ot=0.25, \
                                          perF0=chirpPerF0, \
                                          depthChirpInSemiTone=.15)
            WF0 = WF0[0:F, :] # ensure same size as SX 
            NF0 = F0Table.size # number of harmonic combs
            # Normalization: 
            WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))
            
        sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
        distMatTimeStamps = np.abs(np.outer(np.ones(sizeProvidedMel[0]),
                                            sigTimeStamps) -
                                   np.outer(melTimeStamps, np.ones(N)))
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(np.outer(np.ones(NF0), f0BestPath) -
                                   np.outer(F0Table, np.ones(N)))
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length
        indexBestPath[distMatTimeStamps[minDistTimeStamps,range(N)] >= \
                      0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath<=0] = 0
        
    freqMelody = F0Table[np.array(indexBestPath,dtype=int)]
    freqMelody[indexBestPath==0] = - freqMelody[indexBestPath==0]
    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N) * hopsize / np.double(Fs),
                         freqMelody]).T)
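    # The pitch file thus contains two columns per line: time (s) and
    # F0 (Hz); negative F0 values mark the frames estimated as
    # unvoiced/silent (sign flipped above for indexBestPath == 0).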
    
    # Second round of parameter estimation, with specific
    # initial HF00:
    HF00 = np.zeros([NF0 * chirpPerF0, N])
    
    scopeAllowedHF0 = 2.0 / 1.0
    
    # indexes for HF00:
    # TODO: reprogram this with a 'where'?...
    dim1index = np.array(\
        np.maximum(\
        np.minimum(\
        np.outer(chirpPerF0 * indexBestPath,
                 np.ones(chirpPerF0 \
                         * (2 \
                            * np.floor(stepNotes / scopeAllowedHF0) \
                            + 1))) \
        + np.outer(np.ones(N),
                   np.arange(-chirpPerF0 \
                             * np.floor(stepNotes / scopeAllowedHF0),
                             chirpPerF0 \
                             * (np.floor(stepNotes / scopeAllowedHF0) \
                                + 1))),
        chirpPerF0 * NF0 - 1),
        0),
        dtype=int)
    dim1index = dim1index[indexBestPath!=0,:]
    ## dim1index = dim1index.reshape(1, N * chirpPerF0 \
    ##                        * (2 * np.floor(stepNotes / scopeAllowedHF0) \
    ##                          + 1))
    dim1index = dim1index.reshape(1,dim1index.size)
    
    dim2index = np.outer(np.arange(N),
                         np.ones(chirpPerF0 \
                                 * (2 * np.floor(stepNotes \
                                                 / scopeAllowedHF0) + 1), \
                                 dtype=int)\
                         )
    dim2index = dim2index[indexBestPath!=0,:]
    dim2index = dim2index.reshape(1,dim2index.size)
    ## dim2index.reshape(1, N * chirpPerF0 \
    ##                                * (2 * np.floor(stepNotes \
    ##                                                / scopeAllowedHF0) \
    ##                                   + 1))
    HF00[dim1index, dim2index] = 1 # HF0.max()
    
    HF00[:, indexBestPath == (NF0 - 1)] = 0.0
    HF00[:, indexBestPath == 0] = 0.0
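    # HF00 is now a binary mask around the decoded melody: only F0
    # candidates within roughly stepNotes / scopeAllowedHF0 bins of the
    # best path keep a nonzero initial amplitude for the second round.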
    
    
    WF0effective = WF0
    HF00effective = HF00
    
    if options.melody is None:
        del HF0, HGAMMA, HPHI, HM, WM, HF00, SX
        
    alphaR, alphaL, HGAMMA, HPHI, HF0, \
            betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM(
        # the data to be fitted to:
        SXR, SXL,
        # the basis matrices for the spectral combs
        WF0effective,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
        # if any, initial amplitude matrices for
        HGAMMA0=None, HPHI0=None,
        HF00=HF00effective,
        WM0=None, HM0=None,
        # Some more optional arguments, to control the "convergence"
        # of the algo
        numberOfIterations=niter, updateRulePower=1.0,
        stepNotes=stepNotes, 
        lambdaHF0 = 0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution)
    
    WPHI = np.dot(WGAMMA, HGAMMA)
    SPHI = np.dot(WPHI, HPHI)
    SF0 = np.dot(WF0effective, HF0)
    
    hatSXR = (alphaR**2) * SF0 * SPHI + np.dot(np.dot(WM, betaR**2),HM)
    hatSXL = (alphaL**2) * SF0 * SPHI + np.dot(np.dot(WM, betaL**2),HM)
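    # hatSXR and hatSXL are the model power spectra per channel; the
    # ratios below act as Wiener-like masks applied to the mixture STFTs
    # to extract the lead (voice) signal.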
    
    hatVR = (alphaR**2) * SPHI * SF0 / hatSXR * XR
    
    vestR = istft(hatVR, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0
    
    hatVR = (alphaL**2) * SPHI * SF0 / hatSXL * XL
    
    vestL = istft(hatVR, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0
    
    #scikits.audiolab.wavwrite(np.array([vestR,vestL]).T, \
    #                          options.voc_output_file, fs)
    
    vestR = np.array(np.round(vestR*scaleData), dtype=dataType)
    vestL = np.array(np.round(vestL*scaleData), dtype=dataType)
    wav.write(options.voc_output_file, fs, \
              np.array([vestR,vestL]).T)
    
    #wav.write(options.voc_output_file, fs, \
    #          np.int16(32768.0 * np.array([vestR,vestL]).T))
    
    hatMR = (np.dot(np.dot(WM,betaR ** 2),HM)) / hatSXR * XR
    
    mestR = istft(hatMR, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0
    
    hatMR = (np.dot(np.dot(WM,betaL ** 2),HM)) / hatSXL * XL
    
    mestL = istft(hatMR, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0
    
    #scikits.audiolab.wavwrite(np.array([mestR,mestL]).T, \
    #                          options.mus_output_file, fs)
    
    mestR = np.array(np.round(mestR*scaleData), dtype=dataType)
    mestL = np.array(np.round(mestL*scaleData), dtype=dataType)
    wav.write(options.mus_output_file, fs, \
              np.array([mestR,mestL]).T)
    
    #wav.write(options.mus_output_file, fs, \
    #          np.int16(32768.0 * np.array([mestR,mestL]).T))
    
    del hatMR, mestL, vestL, vestR, mestR, hatVR, hatSXR, hatSXL, SPHI, SF0
    
    # adding the unvoiced part in the source basis:
    WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
    HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
    ## HUF0[-1,:] = HF0.sum(axis=0) # should we do this?
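    # The appended all-ones column of WUF0 is a flat, noise-like
    # spectral shape intended to capture the unvoiced parts of the
    # lead; HUF0 initializes its activation to 1 at every frame.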
    
    alphaR, alphaL, HGAMMA, HPHI, HF0, \
            betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM(
        # the data to be fitted to:
        SXR, SXL,
        # the basis matrices for the spectral combs
        WUF0,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
        # if any, initial amplitude matrices for
        HGAMMA0=HGAMMA, HPHI0=HPHI,
        HF00=HUF0,
        WM0=None,#WM,
        HM0=None,#HM,
        # Some more optional arguments, to control the "convergence"
        # of the algo
        numberOfIterations=niter, updateRulePower=1.0,
        stepNotes=stepNotes, 
        lambdaHF0 = 0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution,
        updateHGAMMA=False)
    
    WPHI = np.dot(WGAMMA, HGAMMA)
    SPHI = np.dot(WPHI, HPHI)
    SF0 = np.dot(WUF0, HF0)
    
    hatSXR = (alphaR**2) * SF0 * SPHI + np.dot(np.dot(WM, betaR**2),HM)
    hatSXL = (alphaL**2) * SF0 * SPHI + np.dot(np.dot(WM, betaL**2),HM)
    
    hatVR = (alphaR**2) * SPHI * SF0 / hatSXR * XR
    
    vestR = istft(hatVR, hopsize=hopsize, nfft=NFT,
                  window=sinebell(windowSizeInSamples)) / 4.0
    
    hatVR = (alphaL**2) * SPHI * SF0 / hatSXL * XL
    
    vestL = istft(hatVR, hopsize=hopsize, nfft=NFT,
                  window=sinebell(windowSizeInSamples)) / 4.0
    
    outputFileName = options.voc_output_file[:-4] + '_VUIMM.wav'
    # scikits.audiolab.wavwrite(np.array([vestR,vestL]).T, outputFileName, fs)
    
    vestR = np.array(np.round(vestR*scaleData), dtype=dataType)
    vestL = np.array(np.round(vestL*scaleData), dtype=dataType)
    wav.write(outputFileName, fs, \
              np.array([vestR,vestL]).T)
    
    hatMR = (np.dot(np.dot(WM,betaR ** 2),HM)) / hatSXR * XR
    
    mestR = istft(hatMR, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0
    
    hatMR = (np.dot(np.dot(WM,betaL ** 2),HM)) / hatSXL * XL
    
    mestL = istft(hatMR, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0
    
    outputFileName = options.mus_output_file[:-4] + '_VUIMM.wav'
    #scikits.audiolab.wavwrite(np.array([mestR,mestL]).T, outputFileName, fs)
    
    mestR = np.array(np.round(mestR*scaleData), dtype=dataType)
    mestL = np.array(np.round(mestL*scaleData), dtype=dataType)
    wav.write(outputFileName, fs, \
              np.array([mestR,mestL]).T)
    
    if displayEvolution:
        plt.close('all')
        ## raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\
        ##           "!! Press Return to end the program...  !!\n"\
        ##           "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    
    print "Done!"
Example #2
def main(inputAudioFile):
    import optparse

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\n"\
                           "solo (vocal) part",
                      default="estimated_solo.wav")
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\n"\
                           "music part",
                      default="estimated_music.wav")
    parser.add_option("-p",
                      "--pitch-output-file",
                      dest="pitch_output_file",
                      type="string",
                      help="name of the output file for the estimated pitches",
                      default="pitches.txt")

    # Some more optional options:
    parser.add_option("-d",
                      "--with-display",
                      dest="displayEvolution",
                      action="store_true",
                      help="display the figures",
                      default=False)
    parser.add_option("-q",
                      "--quiet",
                      dest="verbose",
                      action="store_false",
                      help="use to quiet all output verbose",
                      default=True)
    #Number of iterations
    parser.add_option("--nb-iterations",
                      dest="nbiter",
                      help="number of iterations",
                      type="int",
                      default=50)
    parser.add_option("--window-size",
                      dest="windowSize",
                      type="float",
                      default=0.04644,
                      help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=2048,
                      help="size of Fourier transforms, "\
                           "in samples.")
    parser.add_option("--hopsize",
                      dest="hopsize",
                      type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")
    parser.add_option("--nb-accElements",
                      dest="R",
                      type="float",
                      default=40.0,
                      help="number of elements for the accompaniment.")

    parser.add_option("--with-melody", dest="melody", type="string",
                      default=None,
                      help="provide the melody in a file named MELODY, "\
                           "with at each line: <time (s)><F0 (Hz)>.")

    (options, args) = parser.parse_args()
    #if len(args) != 1:
    #parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        ## plt.rc('text', usetex=True)
        plt.rc('image', cmap='jet')  ## gray_r
        plt.ion()

    # Compulsory option: name of the input file:
    #inputAudioFile = args[0]
    fs, data = wav.read(inputAudioFile)
    # data = np.double(data) /  32768.0 # makes data vary from -1 to 1
    scaleData = 1.2 * data.max()  # to rescale the data.
    dataType = data.dtype
    data = np.double(data) / scaleData  # makes data vary from -1 to 1
    if data.shape[0] == data.size:  # data is mono (a single channel)
        print("The audio file is not stereo. Try separateLead.py instead.")
        raise ValueError("number of dimensions of the input not 2")
    if data.shape[1] != 2:
        print("The data is multichannel, but not stereo... \n")
        print("Unfortunately this program does not scale well. Data is \n")
        print("reduced to its 2 first channels.\n")
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = np.round(options.windowSize * fs)
    hopsize = np.round(options.hopsize * fs)
    NFT = options.fourierSize
    niter = options.nbiter
    R = options.R

    if options.verbose:
        print("Some parameter settings:")
        print("    Size of analysis windows: ", windowSizeInSamples)
        print("    Hopsize: ", hopsize)
        print("    Size of Fourier transforms: ", NFT)
        print("    Number of iterations to be done: ", niter)
        print("    Number of elements in WM: ", R)

    XR, F, N = stft(data[:, 0],
                    fs=fs,
                    hopsize=hopsize,
                    window=sinebell(windowSizeInSamples),
                    nfft=NFT)
    XL, F, N = stft(data[:, 1],
                    fs=fs,
                    hopsize=hopsize,
                    window=sinebell(windowSizeInSamples),
                    nfft=NFT)
    # SX is the power spectrogram:
    ## SXR = np.maximum(np.abs(XR) ** 2, 10 ** -8)
    ## SXL = np.maximum(np.abs(XL) ** 2, 10 ** -8)
    SXR = np.abs(XR)**2
    SXL = np.abs(XL)**2

    del data, F, N

    # TODO: also process these as options:
    eps = 10**-9
    minF0 = 100
    maxF0 = 800
    Fs = fs
    F, N = SXR.shape
    stepNotes = 20  # this is the number of F0s within one semitone
    # until 17/09/2010 : stepNotes = 20
    # 17/09/2010 : trying stepNotes = 8, checking for less artefacts

    K = 10  # number of spectral shapes for the filter part
    # R = 40 # number of spectral shapes for the accompaniment
    P = 30  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
    # this feature should be further studied before
    # we find a good way of doing that.

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = \
             generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \
                                  stepNotes=stepNotes, \
                                  lengthWindow=windowSizeInSamples, Ot=0.25, \
                                  perF0=chirpPerF0, \
                                  depthChirpInSemiTone=.15, loadWF0=True,\
                                  analysisWindow='sinebell')
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization:
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=fs, frequencyScale='linear', \
                               numberOfBasis=P, overlap=.75)

    if displayEvolution:
        plt.figure(1)
        plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()
        # plt.show()
        ## the following seems superfluous if mpl's backend is macosx...
        ##        raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\
        ##                  "!! Press Return to resume the program. !!\n"\
        ##                  "!! Be sure that the figure has been    !!\n"\
        ##                  "!! already displayed, so that the      !!\n"\
        ##                  "!! evolution of HF0 will be visible.   !!\n"\
        ##                  "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    if options.melody is None:
        ## section to estimate the melody, on monophonic algo:
        SX = np.maximum(np.abs((XR + XL) / 2.0)**2, 10**-8)
        # First round of parameter estimation:
        HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
            # the data to be fitted to:
            SX,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K,
            numberOfAccompanimentSpectralShapes=R,
            # putting only 2 elements in accompaniment for a start...
            # if any, initial amplitude matrices for
            HGAMMA0=None,
            HPHI0=None,
            HF00=None,
            WM0=None,
            HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter,
            updateRulePower=1.,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SX.max()),
            alphaHF0=0.9,
            verbose=options.verbose,
            displayEvolution=displayEvolution)

        if displayEvolution:
            h2 = plt.figure(2)
            plt.clf()
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])

        # Viterbi decoding to estimate the predominant fundamental
        # frequency line
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
        cutoffnote = 2 * 5 * stepNotes
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = \
                                  transitions[\
            np.array(np.abs(np.outer(np.ones(NF0), b) \
                            - np.outer(b, np.ones(NF0))), dtype=int)]
        pf_0 = transitions[cutoffnote - 1] * 10**(-90)
        p0_0 = transitions[cutoffnote - 1] * 10**(-100)
        p0_f = transitions[cutoffnote - 1] * 10**(-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 \
                             / np.outer(sumTransitionMatrixF0, \
                                        np.ones(NF0 + 1))

        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        barHF0 = np.array(HF0)

        logHF0[0:NF0, :] = np.log(barHF0)
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.Inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.Inf]), -100)

        indexBestPath = viterbiTrackingArray(\
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=options.verbose)

        if displayEvolution:
            h2.hold(True)
            plt.plot(indexBestPath, '-b')
            h2.hold(False)
            plt.axis('tight')
            ##         raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\
            ##                   "!! Press Return to resume the program  !!\n"\
            ##                   "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

        del logHF0

        # detection of silences:
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        scopeAllowedHF0 = 2.0 / 1.0
        dim1index = np.array(\
            np.maximum(\
                np.minimum(\
                    np.outer(chirpPerF0 * indexBestPath,
                             np.ones(chirpPerF0 \
                                     * (2 \
                                        * np.floor(stepNotes / scopeAllowedHF0) \
                                        + 1))) \
                    + np.outer(np.ones(N),
                               np.arange(-chirpPerF0 \
                                         * np.floor(stepNotes / scopeAllowedHF0),
                                         chirpPerF0 \
                                         * (np.floor(stepNotes / scopeAllowedHF0) \
                                            + 1))),
                    chirpPerF0 * NF0 - 1),
                0),
            dtype=int).reshape(1, N * chirpPerF0 \
                               * (2 * np.floor(stepNotes / scopeAllowedHF0) \
                                  + 1))
        dim2index = np.outer(np.arange(N),
                             np.ones(chirpPerF0 \
                                     * (2 * np.floor(stepNotes \
                                                     / scopeAllowedHF0) + 1), \
                                     dtype=int)\
                             ).reshape(1, N * chirpPerF0 \
                                       * (2 * np.floor(stepNotes \
                                                       / scopeAllowedHF0) \
                                          + 1))
        HF00[dim1index, dim2index] = HF0[dim1index, dim2index]  # HF0.max()

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)
        energyMel = np.sum(np.abs((SPHI * SF0)/hatSX * \
                                  (XR+XL) * 0.5) \
                           ** 2, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized to the maximum of energy:
        # expressed in 0.01 times the percentage
        indAboveThres = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        # np.nonzero never returns None; check for an empty result
        # instead (the original `is None` test could never trigger):
        ind_999 = indAboveThres[0] if indAboveThres.size > 0 else N - 1

        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])
        indexBestPath[melNotPresent] = 0

    else:
        ## take the provided melody line:
        # load melody from file:
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            print("The melody should be provided as <Time (s)><F0 (Hz)>.")
            raise ValueError("Bad melody format")
        melTimeStamps = melodyFromFile[:, 0]  # + 1024 / np.double(Fs)
        melFreqHz = melodyFromFile[:, 1]
        if minF0 > melFreqHz[
                melFreqHz > 40.0].min() or maxF0 < melFreqHz.max():
            minF0 = melFreqHz[melFreqHz > 40.0].min() * .97
            maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03)
            print("Recomputing the source basis for ")
            print("minF0 = ", minF0, "Hz and maxF0 = ", maxF0, "Hz.")
            # Create the harmonic combs, for each F0 between minF0 and maxF0:
            F0Table, WF0 = \
                     generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \
                                          stepNotes=stepNotes, \
                                          lengthWindow=windowSizeInSamples,
                                          Ot=0.25, \
                                          perF0=chirpPerF0, \
                                          depthChirpInSemiTone=.15)
            WF0 = WF0[0:F, :]  # ensure same size as SX
            NF0 = F0Table.size  # number of harmonic combs
            # Normalization:
            WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

        sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
        distMatTimeStamps = np.abs(
            np.outer(np.ones(sizeProvidedMel[0]), sigTimeStamps) -
            np.outer(melTimeStamps, np.ones(N)))
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(
            np.outer(np.ones(NF0), f0BestPath) - np.outer(F0Table, np.ones(N)))
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length
        indexBestPath[distMatTimeStamps[minDistTimeStamps,range(N)] >= \
                      0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath <= 0] = 0

    freqMelody = F0Table[np.array(indexBestPath, dtype=int)]
    freqMelody[indexBestPath == 0] = -freqMelody[indexBestPath == 0]
    np.savetxt(
        options.pitch_output_file,
        np.array([np.arange(N) * hopsize / np.double(Fs), freqMelody]).T)

    # Second round of parameter estimation, with specific
    # initial HF00:
    HF00 = np.zeros([NF0 * chirpPerF0, N])

    scopeAllowedHF0 = 2.0 / 1.0

    # indexes for HF00:
    # TODO: reprogram this with a 'where'?...
    dim1index = np.array(\
        np.maximum(\
        np.minimum(\
        np.outer(chirpPerF0 * indexBestPath,
                 np.ones(chirpPerF0 \
                         * (2 \
                            * np.floor(stepNotes / scopeAllowedHF0) \
                            + 1))) \
        + np.outer(np.ones(N),
                   np.arange(-chirpPerF0 \
                             * np.floor(stepNotes / scopeAllowedHF0),
                             chirpPerF0 \
                             * (np.floor(stepNotes / scopeAllowedHF0) \
                                + 1))),
        chirpPerF0 * NF0 - 1),
        0),
        dtype=int)
    dim1index = dim1index[indexBestPath != 0, :]
    ## dim1index = dim1index.reshape(1, N * chirpPerF0 \
    ##                        * (2 * np.floor(stepNotes / scopeAllowedHF0) \
    ##                          + 1))
    dim1index = dim1index.reshape(1, dim1index.size)

    dim2index = np.outer(np.arange(N),
                         np.ones(chirpPerF0 \
                                 * (2 * np.floor(stepNotes \
                                                 / scopeAllowedHF0) + 1), \
                                 dtype=int)\
                         )
    dim2index = dim2index[indexBestPath != 0, :]
    dim2index = dim2index.reshape(1, dim2index.size)
    ## dim2index.reshape(1, N * chirpPerF0 \
    ##                                * (2 * np.floor(stepNotes \
    ##                                                / scopeAllowedHF0) \
    ##                                   + 1))
    HF00[dim1index, dim2index] = 1  # HF0.max()

    HF00[:, indexBestPath == (NF0 - 1)] = 0.0
    HF00[:, indexBestPath == 0] = 0.0

    WF0effective = WF0
    HF00effective = HF00

    if options.melody is None:
        del HF0, HGAMMA, HPHI, HM, WM, HF00, SX

    alphaR, alphaL, HGAMMA, HPHI, HF0, \
            betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM(
        # the data to be fitted to:
        SXR, SXL,
        # the basis matrices for the spectral combs
        WF0effective,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
        # if any, initial amplitude matrices for
        HGAMMA0=None, HPHI0=None,
        HF00=HF00effective,
        WM0=None, HM0=None,
        # Some more optional arguments, to control the "convergence"
        # of the algo
        numberOfIterations=niter, updateRulePower=1.0,
        stepNotes=stepNotes,
        lambdaHF0 = 0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution)

    WPHI = np.dot(WGAMMA, HGAMMA)
    SPHI = np.dot(WPHI, HPHI)
    SF0 = np.dot(WF0effective, HF0)

    hatSXR = (alphaR**2) * SF0 * SPHI + np.dot(np.dot(WM, betaR**2), HM)
    hatSXL = (alphaL**2) * SF0 * SPHI + np.dot(np.dot(WM, betaL**2), HM)

    hatVR = (alphaR**2) * SPHI * SF0 / hatSXR * XR

    vestR = istft(
        hatVR, hopsize=hopsize, nfft=NFT,
        window=sinebell(windowSizeInSamples)) / 4.0

    hatVR = (alphaL**2) * SPHI * SF0 / hatSXL * XL

    vestL = istft(
        hatVR, hopsize=hopsize, nfft=NFT,
        window=sinebell(windowSizeInSamples)) / 4.0

    #scikits.audiolab.wavwrite(np.array([vestR,vestL]).T, \
    #                          options.voc_output_file, fs)

    vestR = np.array(np.round(vestR * scaleData), dtype=dataType)
    vestL = np.array(np.round(vestL * scaleData), dtype=dataType)
    # wav.write(options.voc_output_file, fs, \
    #           np.array([vestR,vestL]).T)

    #wav.write(options.voc_output_file, fs, \
    #          np.int16(32768.0 * np.array([vestR,vestL]).T))

    hatMR = (np.dot(np.dot(WM, betaR**2), HM)) / hatSXR * XR

    mestR = istft(
        hatMR, hopsize=hopsize, nfft=NFT,
        window=sinebell(windowSizeInSamples)) / 4.0

    hatMR = (np.dot(np.dot(WM, betaL**2), HM)) / hatSXL * XL

    mestL = istft(
        hatMR, hopsize=hopsize, nfft=NFT,
        window=sinebell(windowSizeInSamples)) / 4.0

    #scikits.audiolab.wavwrite(np.array([mestR,mestL]).T, \
    #                          options.mus_output_file, fs)

    mestR = np.array(np.round(mestR * scaleData), dtype=dataType)
    mestL = np.array(np.round(mestL * scaleData), dtype=dataType)
    # wav.write(options.mus_output_file, fs, \
    #           np.array([mestR,mestL]).T)

    #wav.write(options.mus_output_file, fs, \
    #          np.int16(32768.0 * np.array([mestR,mestL]).T))

    del hatMR, mestL, vestL, vestR, mestR, hatVR, hatSXR, hatSXL, SPHI, SF0

    # adding the unvoiced part in the source basis:
    WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
    HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
    ## HUF0[-1,:] = HF0.sum(axis=0) # should we do this?

    alphaR, alphaL, HGAMMA, HPHI, HF0, \
            betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM(
        # the data to be fitted to:
        SXR, SXL,
        # the basis matrices for the spectral combs
        WUF0,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
        # if any, initial amplitude matrices for
        HGAMMA0=HGAMMA, HPHI0=HPHI,
        HF00=HUF0,
        WM0=None,#WM,
        HM0=None,#HM,
        # Some more optional arguments, to control the "convergence"
        # of the algo
        numberOfIterations=niter, updateRulePower=1.0,
        stepNotes=stepNotes,
        lambdaHF0 = 0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution,
        updateHGAMMA=False)

    WPHI = np.dot(WGAMMA, HGAMMA)
    SPHI = np.dot(WPHI, HPHI)
    SF0 = np.dot(WUF0, HF0)

    hatSXR = (alphaR**2) * SF0 * SPHI + np.dot(np.dot(WM, betaR**2), HM)
    hatSXL = (alphaL**2) * SF0 * SPHI + np.dot(np.dot(WM, betaL**2), HM)

    hatVR = (alphaR**2) * SPHI * SF0 / hatSXR * XR

    vestR = istft(
        hatVR, hopsize=hopsize, nfft=NFT,
        window=sinebell(windowSizeInSamples)) / 4.0

    hatVR = (alphaL**2) * SPHI * SF0 / hatSXL * XL

    vestL = istft(
        hatVR, hopsize=hopsize, nfft=NFT,
        window=sinebell(windowSizeInSamples)) / 4.0

    outputFileName = options.voc_output_file[:-4] + '_VUIMM.wav'
    # scikits.audiolab.wavwrite(np.array([vestR,vestL]).T, outputFileName, fs)

    vestR = np.array(np.round(vestR * scaleData), dtype=dataType)
    vestL = np.array(np.round(vestL * scaleData), dtype=dataType)
    # wav.write(outputFileName, fs, \
    #           np.array([vestR,vestL]).T)

    hatMR = (np.dot(np.dot(WM, betaR**2), HM)) / hatSXR * XR

    mestR = istft(
        hatMR, hopsize=hopsize, nfft=NFT,
        window=sinebell(windowSizeInSamples)) / 4.0

    hatMR = (np.dot(np.dot(WM, betaL**2), HM)) / hatSXL * XL

    mestL = istft(
        hatMR, hopsize=hopsize, nfft=NFT,
        window=sinebell(windowSizeInSamples)) / 4.0

    # This is the required output file:
    outputFileName = options.mus_output_file[:-4] + '_VUIMM.wav'
    #scikits.audiolab.wavwrite(np.array([mestR,mestL]).T, outputFileName, fs)
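    # NB: the chdir below assumes the process was started from a
    # directory containing media/karaoke (and that os is imported at
    # module level); the accompaniment file is written there.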
    os.chdir('media/karaoke')
    mestR = np.array(np.round(mestR * scaleData), dtype=dataType)
    mestL = np.array(np.round(mestL * scaleData), dtype=dataType)
    wav.write(outputFileName, fs, \
              np.array([mestR,mestL]).T)

    if displayEvolution:
        plt.close('all')
        ## raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\
        ##           "!! Press Return to end the program...  !!\n"\
        ##           "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("Done!")
    print(outputFileName)
    os.chdir('../..')  # back to the original working directory
def main():
    import optparse
    
    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\n"\
                           "solo (vocal) part. \n"\
                           "If None, appends _lead to inputAudioFile.",
                      default=None)
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\n"\
                           "music part.\n"\
                           "If None, appends _acc to inputAudioFile.",
                      default=None)
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated pitches.\n"
                           "If None, appends _pitches to inputAudioFile",
                      default=None)
    
    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true",help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to quiet all output verbose",
                      default=True)
    parser.add_option("-n", "--dontseparate", dest="separateSignals",
                      action="store_false",
                      help="Trigger this option if you only desire to "+\
                           "estimate the melody",
                      default=True)
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int",
                      default=30)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644,help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=None,
                      help="size of Fourier transforms, "\
                           "in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")
    parser.add_option("--nb-accElements", dest="R", type="float",
                      default=40.0,
                      help="number of elements for the accompaniment.")
    
    parser.add_option("--with-melody", dest="melody", type="string",
                      default=None,
                      help="provide the melody in a file named MELODY, "\
                           "with at each line: <time (s)><F0 (Hz)>.")
    
    parser.add_option("--numAtomFilters", dest="P_numAtomFilters",
                      type="int", default=30,
                      help="Number of atomic filters - in WGAMMA.")
    parser.add_option("--numFilters", dest="K_numFilters", type="int",
                      default=10,
                      help="Number of filters for decomposition - in WPHI")
    parser.add_option("--min-F0-Freq", dest="minF0", type="float",
                      default=100.0,
                      help="Minimum of fundamental frequency F0.")
    parser.add_option("--max-F0-Freq", dest="maxF0", type="float",
                      default=800.0,
                      help="Maximum of fundamental frequency F0.")
    parser.add_option("--step-F0s", dest="stepNotes", type="int",
                      default=20,
                      help="Number of F0s in dictionary for each semitone.")
    
    (options, args) = parser.parse_args()
    
    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")
    
    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab
        
        ## plt.rc('text', usetex=True)
        plt.rc('image',cmap='jet') ## gray_r
        plt.ion()
        
    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    if inputAudioFile[-4:] != ".wav":
        raise ValueError("Input file does not end in .wav. "
                         "Only the WAV format is supported, for now...")
    
    if options.mus_output_file is None:
        options.mus_output_file = inputAudioFile[:-4]+'_acc.wav'
    
    if options.voc_output_file is None:
        options.voc_output_file = inputAudioFile[:-4]+'_lead.wav'
    
    if options.pitch_output_file is None:
        options.pitch_output_file = inputAudioFile[:-4]+'_pitches.txt'
    
    print("Writing the different following output files:")
    print("    separated lead          in", options.voc_output_file)
    print("    separated accompaniment in", options.mus_output_file)
    print("    separated lead + unvoc  in", options.voc_output_file[:-4] + \
          '_VUIMM.wav')
    print("    separated acc  - unvoc  in", options.mus_output_file[:-4] + \
          '_VUIMM.wav')
    print("    estimated pitches       in", options.pitch_output_file)
    
    Fs, data = wav.read(inputAudioFile)
    # data = np.double(data) /  32768.0 # makes data vary from -1 to 1
    scaleData = 1.2 * data.max() # to rescale the data.
    dataType = data.dtype
    data = np.double(data) / scaleData # makes data vary from -1 to 1
    is_stereo = True
    if data.shape[0] == data.size: # data is mono (a single channel)
        print("The audio file is not stereo. Making stereo out of mono.")
        print("(You could also try the older separateLead.py...)")
        is_stereo = False
        # data = np.vstack([data,data]).T 
        # raise ValueError("number of dimensions of the input not 2")
    if is_stereo and data.shape[1] != 2:
        print("The data is multichannel, but not stereo... \n")
        print("Unfortunately this program does not scale well. Data is \n")
        print("reduced to its 2 first channels.\n")
        data = data[:,0:2]
    
    # Processing the options:
    windowSizeInSamples = int(nextpow2(np.round(options.windowSize * Fs)) )
    
    hopsize = np.round(options.hopsize * Fs)
    if hopsize != windowSizeInSamples/8:
        #print "Overriding given hopsize to use 1/8th of window size"
        #hopsize = windowSizeInSamples/8
        warnings.warn("Chosen hopsize: "+str(hopsize)+\
                      ", while windowsize: "+str(windowSizeInSamples))
    
    if options.fourierSize is None:
        NFT = windowSizeInSamples
    else:
        NFT = options.fourierSize
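    # By default the Fourier size equals the analysis window size,
    # which nextpow2 above rounded up to a power of 2.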

    # number of iterations for each parameter estimation step: 
    niter = options.nbiter
    # number of spectral shapes for the accompaniment
    R = options.R
    
    eps = 10 ** -9
    
    if options.verbose:
        print("Some parameter settings:")
        print("    Size of analysis windows: ", windowSizeInSamples)
        print("    Hopsize: ", hopsize)
        print("    Size of Fourier transforms: ", NFT)
        print("    Number of iterations to be done: ", niter)
        print("    Number of elements in WM: ", R)
        
    if is_stereo:
        XR, F, N = stft(data[:,0], fs=Fs, hopsize=hopsize,
                        window=sinebell(windowSizeInSamples), nfft=NFT)
        XL, F, N = stft(data[:,1], fs=Fs, hopsize=hopsize,
                        window=sinebell(windowSizeInSamples), nfft=NFT)
        # SX is the power spectrogram:
        ## SXR = np.maximum(np.abs(XR) ** 2, 10 ** -8)
        ## SXL = np.maximum(np.abs(XL) ** 2, 10 ** -8)
        #SXR = np.abs(XR) ** 2
        #SXL = np.abs(XL) ** 2
        SX = np.maximum((0.5*np.abs(XR+XL)) ** 2, eps)
    else: # data is mono
        X, F, N = stft(data, fs=Fs, hopsize=hopsize,
                       window=sinebell(windowSizeInSamples), nfft=NFT)
        SX = np.maximum(np.abs(X) ** 2, eps)
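    # In both cases SX is a single power spectrogram: for stereo input
    # it is computed from the mid (average) signal 0.5 * (XR + XL).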
    
    del data, F, N
    
    # TODO: also process these as options:
    # minimum and maximum F0 in glottal source spectra dictionary
    minF0 = options.minF0
    maxF0 = options.maxF0
    F, N = SX.shape
    stepNotes = options.stepNotes # this is the number of F0s within one semitone
    
    K = options.K_numFilters # number of spectral shapes for the filter part
    P = options.P_numAtomFilters # number of elements in dictionary of smooth filters
    chirpPerF0 = 1 # number of chirped spectral shapes between each F0
    # this feature should be further studied before
    # we find a good way of doing that.
    
    # Create the harmonic combs, for each F0 between minF0 and maxF0: 
    F0Table, WF0 = \
             generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \
                                  stepNotes=stepNotes, \
                                  lengthWindow=windowSizeInSamples, Ot=0.25, \
                                  perF0=chirpPerF0, \
                                  depthChirpInSemiTone=.15, loadWF0=True,\
                                  analysisWindow='sinebell')
    WF0 = WF0[0:F, :] # ensure same size as SX 
    NF0 = F0Table.size # number of harmonic combs
    # Normalization: 
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))
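    # NB: the per-column scaling above gives each harmonic comb a unit peak;
    # with numpy broadcasting it could equivalently be written as:
    ##     WF0 = WF0 / np.amax(WF0, axis=0)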
    
    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=Fs, frequencyScale='linear', \
                               numberOfBasis=P, overlap=.75)
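    # WGAMMA stacks P overlapping Hann-shaped atoms on a linear frequency
    # scale; a smooth filter response is then modeled as a nonnegative
    # combination of these columns.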
    
    if displayEvolution:
        plt.figure(1);plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()
        # plt.show()
        ## the following seems superfluous if mpl's backend is macosx...
        ##        raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\
        ##                  "!! Press Return to resume the program. !!\n"\
        ##                  "!! Be sure that the figure has been    !!\n"\
        ##                  "!! already displayed, so that the      !!\n"\
        ##                  "!! evolution of HF0 will be visible.   !!\n"\
        ##                  "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        
    if options.melody is None:
        ## section to estimate the melody, on monophonic algo:
        # First round of parameter estimation:
        HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
            # the data to be fitted to:
            SX,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
            # putting only 2 elements in accompaniment for a start...
            # if any, initial amplitude matrices for 
            HGAMMA0=None, HPHI0=None,
            HF00=None,
            WM0=None, HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter, updateRulePower=1.,
            stepNotes=stepNotes, 
            lambdaHF0 = 0.0 / (1.0 * SX.max()), alphaHF0=0.9,
            verbose=options.verbose, displayEvolution=displayEvolution)
        
        if displayEvolution:
            h2 = plt.figure(2); plt.clf()
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])
            
        # Viterbi decoding to estimate the predominant fundamental
        # frequency line
        # create transition probability matrix - ad hoc parameter 'scale'
        # TODO: use "learned" parameter scale (NB: after many trials, the
        # provided scale and parameterization seem robust)
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0,NF0) / stepNotes) * scale)
        cutoffnote = 2 * 5 * stepNotes
        transitions[cutoffnote:] = transitions[cutoffnote - 1]
        
        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1]) # toeplitz matrix
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = \
                                  transitions[\
            np.array(np.abs(np.outer(np.ones(NF0), b) \
                            - np.outer(b, np.ones(NF0))), dtype=int)]
        pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
        p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
        p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0
        
        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 \
                             / np.outer(sumTransitionMatrixF0, \
                                        np.ones(NF0 + 1))
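        ## Toy example of the band structure above (before row normalization),
        ## assuming NF0 = 3 and stepNotes = 1, so that
        ## transitions = exp(-scale * [0, 1, 2]) ~ [1.0000, 0.3679, 0.1353]:
        ##     transitionMatrixF0[0:3, 0:3] =
        ##         [[1.0000, 0.3679, 0.1353],
        ##          [0.3679, 1.0000, 0.3679],
        ##          [0.1353, 0.3679, 1.0000]]
        ## the extra row/column (index NF0) is a "silence" state entered and
        ## left with the tiny probabilities p0_f, pf_0 and p0_0.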
        
        # prior probabilities, and setting the array for Viterbi tracking:
        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        barHF0 = np.array(HF0)
        
        logHF0[0:NF0, :] = np.log(barHF0)
        logHF0[0:NF0, normHF0==0] = np.amin(logHF0[logHF0>-np.inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0>-np.inf]),-100)
        
        indexBestPath = viterbiTrackingArray(\
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=options.verbose)
        
        if displayEvolution:
            plt.figure(2)  # Figure.hold was removed from matplotlib;
            plt.plot(indexBestPath, '-b')  # overlaying is now the default
            plt.axis('tight')
        
        del logHF0
        
        # detection of silences:
        # computing the melody restricted F0 amplitude matrix HF00
        # (which will be used as initial HF0 for further algo):
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        scopeAllowedHF0 = 2.0 / 1.0
        # computing indices for and around the melody indices,
        # dim1index are indices along axis 0, and dim2index along axis 1
        # of HF0:
        #     TODO: use numpy broadcasting to make this "clearer" (if possible...)
        dim1index = np.array(\
            np.maximum(\
                np.minimum(\
                    np.outer(chirpPerF0 * indexBestPath,
                             np.ones( int(chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1)) )) \
                    + np.outer(np.ones(N),
                               np.arange(-chirpPerF0 * np.floor(stepNotes / scopeAllowedHF0),
                                         chirpPerF0 * (np.floor(stepNotes / scopeAllowedHF0) + 1))),
                    chirpPerF0 * NF0 - 1),
                0),
            dtype=int).reshape(1, int(N * chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1)))
        dim2index = np.outer(np.arange(N),
                             np.ones( int(chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1)), dtype=int)\
                             ).reshape(1, int(N * chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1)))
        HF00[dim1index, dim2index] = HF0[dim1index, dim2index]  # HF0.max()
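        ## Re the TODO above: a broadcasting sketch of the same selection
        ## (assuming chirpPerF0 == 1), kept as a comment for reference:
        ##     halfWidth = int(np.floor(stepNotes / scopeAllowedHF0))
        ##     rows = np.clip(indexBestPath[:, None]
        ##                    + np.arange(-halfWidth, halfWidth + 1), 0, NF0 - 1)
        ##     cols = np.arange(N)[:, None]
        ##     HF00[rows, cols] = HF0[rows, cols]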
        
        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0
        
        # remove frames with less than (100 thres_energy) % of total energy. 
        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)
        energyMel = np.sum((((SPHI * SF0)/hatSX)**2) * SX, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized to the maximum of energy:
        # expressed in 0.01 times the percentage
        nonzeroInds = np.nonzero(energyMelCumulNorm>thres_energy)[0]
        # np.nonzero never returns None; guard against an empty result instead:
        ind_999 = nonzeroInds[0] if nonzeroInds.size else N - 1
        
        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])
        indexBestPath[melNotPresent] = 0
        
    else:
        ## take the provided melody line:
        # load melody from file:
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            print("The melody should be provided as <Time (s)><F0 (Hz)>.")
            raise ValueError("Bad melody format")
        melTimeStamps = melodyFromFile[:,0] # + 1024 / np.double(Fs)
        melFreqHz = melodyFromFile[:,1]
        if minF0 > melFreqHz[melFreqHz>40.0].min() or maxF0 < melFreqHz.max():
            minF0 = melFreqHz[melFreqHz>40.0].min() *.97
            maxF0 = np.maximum(melFreqHz.max()*1.03, 2*minF0 * 1.03)
            print("Recomputing the source basis for ")
            print("minF0 = ", minF0, "Hz and maxF0 = ", maxF0, "Hz.")
            # Create the harmonic combs, for each F0 between minF0 and maxF0: 
            F0Table, WF0 = \
                     generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \
                                          stepNotes=stepNotes, \
                                          lengthWindow=windowSizeInSamples,
                                          Ot=0.25, \
                                          perF0=chirpPerF0, \
                                          depthChirpInSemiTone=.15)
            WF0 = WF0[0:F, :] # ensure same size as SX 
            NF0 = F0Table.size # number of harmonic combs
            # Normalization: 
            WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))
            
        sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
        distMatTimeStamps = np.abs(np.outer(np.ones(sizeProvidedMel[0]),
                                            sigTimeStamps) -
                                   np.outer(melTimeStamps, np.ones(N)))
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(np.outer(np.ones(NF0), f0BestPath) -
                                   np.outer(F0Table, np.ones(N)))
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length
        indexBestPath[distMatTimeStamps[minDistTimeStamps,range(N)] >= \
                      0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath<=0] = 0
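        # In short: each analysis frame is matched to the nearest annotated
        # timestamp, and that frame's F0 to the nearest entry of F0Table;
        # frames farther than half a window from any annotation, or with a
        # non-positive F0, are treated as silence (index 0).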
        
    freqMelody = F0Table[np.array(indexBestPath,dtype=int)]
    freqMelody[indexBestPath==0] = - freqMelody[indexBestPath==0]
    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N) * hopsize / np.double(Fs),
                         freqMelody]).T)
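    # NB: each line of the pitch file is "<time (s)> <F0 (Hz)>"; the sign flip
    # above marks unvoiced/silent frames with a negative F0.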
    
    # If separation is required:
    if options.separateSignals:
        # Second round of parameter estimation, with specific
        # initial HF00:
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        
        scopeAllowedHF0 = 2.0 / 1.0
        
        # indexes for HF00:
        # TODO: reprogram this with a 'where'?...
        dim1index = np.array(\
            np.maximum(\
            np.minimum(\
            np.outer(chirpPerF0 * indexBestPath,
                     np.ones( int(chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1) ))) \
            + np.outer(np.ones(N),
                       np.arange(-chirpPerF0 * np.floor(stepNotes / scopeAllowedHF0),
                                 chirpPerF0 * (np.floor(stepNotes / scopeAllowedHF0) + 1))),
            chirpPerF0 * NF0 - 1),
            0),
            dtype=int)
        dim1index = dim1index[indexBestPath!=0,:]
        ## dim1index = dim1index.reshape(1, N * chirpPerF0 \
        ##                        * (2 * np.floor(stepNotes / scopeAllowedHF0) \
        ##                          + 1))
        dim1index = dim1index.reshape(1,dim1index.size)
        
        dim2index = np.outer(np.arange(N), np.ones( int(chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1)), dtype=int) )
        dim2index = dim2index[indexBestPath!=0,:]
        dim2index = dim2index.reshape(1,dim2index.size)
        ## dim2index.reshape(1, N * chirpPerF0 \
        ##                                * (2 * np.floor(stepNotes \
        ##                                                / scopeAllowedHF0) \
        ##                                   + 1))
        HF00[dim1index, dim2index] = 1 # HF0.max()
        
        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0
        
        
        WF0effective = WF0
        HF00effective = HF00
        
        if options.melody is None:
            del HF0, HGAMMA, HPHI, HM, WM, HF00
        
        if is_stereo:
            del SX
            SXR = np.maximum(np.abs(XR) ** 2, eps)
            SXL = np.maximum(np.abs(XL) ** 2, eps)
            alphaR, alphaL, HGAMMA, HPHI, HF0, \
                betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM(
                    # the data to be fitted to:
                    SXR, SXL,
                    # the basis matrices for the spectral combs
                    WF0effective,
                    # and for the elementary filters:
                    WGAMMA,
                    # number of desired filters, accompaniment spectra:
                    numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
                    # if any, initial amplitude matrices for
                    HGAMMA0=None, HPHI0=None,
                    HF00=HF00effective,
                    WM0=None, HM0=None,
                    # Some more optional arguments, to control the "convergence"
                    # of the algo
                    numberOfIterations=niter, updateRulePower=1.0,
                    stepNotes=stepNotes, 
                    lambdaHF0 = 0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
                    verbose=options.verbose, displayEvolution=displayEvolution)
            
            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WF0effective, HF0)
            
            hatSXR = (alphaR**2) * SF0 * SPHI + np.dot(np.dot(WM, betaR**2),HM)
            hatSXL = (alphaL**2) * SF0 * SPHI + np.dot(np.dot(WM, betaL**2),HM)
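            # The estimates below are generalized Wiener filters: each channel
            # STFT is weighted by the ratio of the source's modeled PSD to the
            # total modeled PSD, e.g. for the lead on the right channel:
            #     hatVR = (alphaR**2 * SPHI * SF0 / hatSXR) * XR
            # i.e. a soft time-frequency mask applied to XR.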
            
            hatVR = (alphaR**2) * SPHI * SF0 / hatSXR * XR
            
            vestR = istft(hatVR, hopsize=int(hopsize), nfft=int(NFT), window=sinebell(windowSizeInSamples)) / 4.0
            
            hatVR = (alphaL**2) * SPHI * SF0 / hatSXL * XL
            
            vestL = istft(hatVR, hopsize=int(hopsize), nfft=int(NFT), window=sinebell(windowSizeInSamples)) / 4.0
            
            #scikits.audiolab.wavwrite(np.array([vestR,vestL]).T, \
            #                          options.voc_output_file, Fs)
            
            vestR = np.array(np.round(vestR*scaleData), dtype=dataType)
            vestL = np.array(np.round(vestL*scaleData), dtype=dataType)
            wav.write(options.voc_output_file, Fs, \
                      np.array([vestR,vestL]).T)
            
            #wav.write(options.voc_output_file, Fs, \
            #          np.int16(32768.0 * np.array([vestR,vestL]).T))
            
            hatMR = (np.dot(np.dot(WM,betaR ** 2),HM)) / hatSXR * XR
            
            mestR = istft(hatMR, hopsize=int(hopsize), nfft=int(NFT), window=sinebell(windowSizeInSamples)) / 4.0
            
            hatMR = (np.dot(np.dot(WM,betaL ** 2),HM)) / hatSXL * XL
            
            mestL = istft(hatMR, hopsize=int(hopsize), nfft=int(NFT), window=sinebell(windowSizeInSamples)) / 4.0
            
            #scikits.audiolab.wavwrite(np.array([mestR,mestL]).T, \
            #                          options.mus_output_file, Fs)
            
            mestR = np.array(np.round(mestR*scaleData), dtype=dataType)
            mestL = np.array(np.round(mestL*scaleData), dtype=dataType)
            wav.write(options.mus_output_file, Fs, \
                      np.array([mestR,mestL]).T)
            
            #wav.write(options.mus_output_file, Fs, \
            #          np.int16(32768.0 * np.array([mestR,mestL]).T))
            
            del hatMR, mestL, vestL, vestR, mestR, hatVR, hatSXR, hatSXL, SPHI, SF0
        
            # adding the unvoiced part in the source basis:
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            ## HUF0[-1,:] = HF0.sum(axis=0) # should we do this?
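            # The appended all-ones column of WUF0 is a flat, noise-like atom
            # meant to capture the unvoiced parts of the lead; its gains (the
            # extra row of HUF0) start at ones and are re-estimated below.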
            
            alphaR, alphaL, HGAMMA, HPHI, HF0, \
                betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM(
                    # the data to be fitted to:
                    SXR, SXL,
                    # the basis matrices for the spectral combs
                    WUF0,
                    # and for the elementary filters:
                    WGAMMA,
                    # number of desired filters, accompaniment spectra:
                    numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
                    # if any, initial amplitude matrices for
                    HGAMMA0=HGAMMA, HPHI0=HPHI,
                    HF00=HUF0,
                    WM0=None,  # WM,
                    HM0=None,  # HM,
                    # Some more optional arguments, to control the "convergence"
                    # of the algo
                    numberOfIterations=niter, updateRulePower=1.0,
                    stepNotes=stepNotes,
                    lambdaHF0=0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
                    verbose=options.verbose, displayEvolution=displayEvolution,
                    updateHGAMMA=False)
            
            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WUF0, HF0)
            
            hatSXR = (alphaR**2) * SF0 * SPHI + np.dot(np.dot(WM, betaR**2),HM)
            hatSXL = (alphaL**2) * SF0 * SPHI + np.dot(np.dot(WM, betaL**2),HM)
            
            hatVR = (alphaR**2) * SPHI * SF0 / hatSXR * XR
            
            vestR = istft(hatVR, hopsize=int(hopsize), nfft=int(NFT), window=sinebell(windowSizeInSamples)) / 4.0
            
            hatVR = (alphaL**2) * SPHI * SF0 / hatSXL * XL
            
            vestL = istft(hatVR, hopsize=int(hopsize), nfft=int(NFT), window=sinebell(windowSizeInSamples)) / 4.0
            
            outputFileName = options.voc_output_file[:-4] + '_VUIMM.wav'
            
            vestR = np.array(np.round(vestR*scaleData), dtype=dataType)
            vestL = np.array(np.round(vestL*scaleData), dtype=dataType)
            wav.write(outputFileName, Fs, \
                      np.array([vestR,vestL]).T)
            
            hatMR = (np.dot(np.dot(WM,betaR ** 2),HM)) / hatSXR * XR
            
            mestR = istft(hatMR, hopsize=int(hopsize), nfft=int(NFT), window=sinebell(windowSizeInSamples)) / 4.0
            
            hatMR = (np.dot(np.dot(WM,betaL ** 2),HM)) / hatSXL * XL
            
            mestL = istft(hatMR, hopsize=int(hopsize), nfft=int(NFT), window=sinebell(windowSizeInSamples)) / 4.0
            
            outputFileName = options.mus_output_file[:-4] + '_VUIMM.wav'
            
            mestR = np.array(np.round(mestR*scaleData), dtype=dataType)
            mestL = np.array(np.round(mestL*scaleData), dtype=dataType)
            wav.write(outputFileName, Fs, \
                      np.array([mestR,mestL]).T)
        else:
            # running on monophonic data:
            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WF0effective,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
                # putting only 2 elements in accompaniment for a start...
                # if any, initial amplitude matrices for 
                HGAMMA0=None, HPHI0=None,
                HF00=HF00effective,
                WM0=None, HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter, updateRulePower=1.,
                stepNotes=stepNotes, 
                lambdaHF0 = 0.0 / (1.0 * SX.max()), alphaHF0=0.9,
                verbose=options.verbose, displayEvolution=displayEvolution)
            
            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WF0effective, HF0)
            SM = np.dot(WM,HM)
            
            hatSX =  SF0 * SPHI + SM
            
            hatV = SPHI * SF0 / hatSX * X
            
            vest = istft(hatV, hopsize=int(hopsize), nfft=int(NFT), window=sinebell(windowSizeInSamples)) / 4.0
            
            vest = np.array(np.round(vest*scaleData), dtype=dataType)
            wav.write(options.voc_output_file, Fs, vest)
            
            hatM = SM / hatSX * X
            
            mest = istft(hatM, hopsize=int(hopsize), nfft=int(NFT), window=sinebell(windowSizeInSamples)) / 4.0
            
            mest = np.array(np.round(mest*scaleData), dtype=dataType)
            wav.write(options.mus_output_file, Fs, mest)
            
            del hatM, vest, mest, hatV, hatSX, SPHI, SF0
            
            # adding the unvoiced part in the source basis:
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            ## HUF0[-1,:] = HF0.sum(axis=0) # should we do this?
            
            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WUF0,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
                # putting only 2 elements in accompaniment for a start...
                # if any, initial amplitude matrices for 
                HGAMMA0=HGAMMA, HPHI0=HPHI,
                HF00=HUF0,
                WM0=None, HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter, updateRulePower=1.,
                stepNotes=stepNotes, 
                lambdaHF0 = 0.0 / (1.0 * SX.max()), alphaHF0=0.9,
                verbose=options.verbose, displayEvolution=displayEvolution,
                updateHGAMMA=False)
            
            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WUF0, HF0)
            SM = np.dot(WM,HM)
            
            hatSX =  SF0 * SPHI + SM
            
            hatV = SPHI * SF0 / hatSX * X
            
            vest = istft(hatV, hopsize=int(hopsize), nfft=int(NFT), window=sinebell(windowSizeInSamples)) / 4.0
            
            vest = np.array(np.round(vest*scaleData), dtype=dataType)
            outputFileName = options.voc_output_file[:-4] + '_VUIMM.wav'
            wav.write(outputFileName, Fs, vest)
            
            hatM = SM / hatSX * X
            
            mest = istft(hatM, hopsize=int(hopsize), nfft=int(NFT), window=sinebell(windowSizeInSamples)) / 4.0
            
            mest = np.array(np.round(mest*scaleData), dtype=dataType)
            
            outputFileName = options.mus_output_file[:-4] + '_VUIMM.wav'
            wav.write(outputFileName, Fs, mest)
            

        if displayEvolution:
            plt.close('all')
            
    print("Done!")


def main(args, options):

    stereoEstimation = True

    # Median filtering in spectrogram
    HPS = False

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        ## plt.rc('text', usetex=True)
        plt.rc('image', cmap='jet')  ## gray_r
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = ''
    if len(args) >= 2:
        inputAudioFile = args[0]
        options.pitch_output_file = args[1]
    elif len(args) == 1:
        inputAudioFile = args[0]
    else:
        inputAudioFile = options.input_file

    if inputAudioFile[-4:] != ".wav":
        raise ValueError(
            "File not WAV file? Only the WAV format is supported, for now...")

    #print "Writing the different following output files:"
    if options.vit_pitch_output_file is not None:
        print("    estimated pitches in", options.vit_pitch_output_file)
    if options.sal_output_file is not None:
        print("    salience file in ", options.sal_output_file)

    if options.pitch_output_file is None:
        options.pitch_output_file = inputAudioFile[:-4] + '_pitches.txt'

    try:
        from essentia.standard import AudioLoader
        loaded = AudioLoader(filename=inputAudioFile)()
        audio = loaded[0]
        Fs = loaded[1]
        nchan = loaded[2]
        if nchan == 1:
            data = audio[:, 0].transpose()
        else:
            data = audio.transpose()

        data = np.double(data) / (1.2 * abs(data).max())
    except Exception:  # essentia unavailable, or it failed to read the file
        # Using scipy to import wav
        import scipy.io.wavfile as wav
        Fs, data = wav.read(inputAudioFile)
        # data = np.double(data) /  32768.0 # makes data vary from -1 to 1
        scaleData = 1.2 * data.max()  # to rescale the data.
        data = np.double(data) / scaleData  # makes data vary from -1 to 1
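    # Either way, data now lies roughly within [-1, 1], with a 1/1.2 headroom
    # factor applied in both loading branches.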
    options.Fs = Fs
    is_stereo = True
    if data.shape[0] == data.size:  # data has a single channel (mono)
        #print "The audio file is not stereo."
        #print "The audio file is not stereo. Making stereo out of mono."
        #print "(You could also try the older separateLead.py...)"
        is_stereo = False
        # data = np.vstack([data,data]).T
        # raise ValueError("number of dimensions of the input not 2")
    if is_stereo and data.shape[1] != 2:
        print("The data is multichannel, but not stereo...")
        print("Unfortunately this program does not scale well. Data is")
        print("reduced to its first 2 channels.")
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = int(nextpow2(np.round(options.windowSize * Fs)))

    hopsize = np.round(options.hopsize * Fs)
    #if hopsize != windowSizeInSamples/8:
    #    #print "Overriding given hopsize to use 1/8th of window size"
    #    #hopsize = windowSizeInSamples/8
    #    warnings.warn("Chosen hopsize: "+str(hopsize)+\
    #                  ", while windowsize: "+str(windowSizeInSamples))

    options.hopsizeInSamples = hopsize
    if options.fourierSize is None:
        NFT = windowSizeInSamples
    else:
        NFT = options.fourierSize

    # number of iterations for each parameter estimation step:
    niter = options.nbiter
    # number of spectral shapes for the accompaniment
    R = int(options.R)

    eps = 10**-9

    if options.verbose:
        print "Some parameter settings:"
        print "    Size of analysis windows: ", windowSizeInSamples
        print "    Hopsize: ", hopsize
        print "    Size of Fourier transforms: ", NFT
        print "    Number of iterations to be done: ", niter
        print "    Number of elements in WM: ", R

    if is_stereo:
        XR, F, N = stft(data[:, 0],
                        fs=Fs,
                        hopsize=hopsize,
                        window=sinebell(windowSizeInSamples),
                        nfft=NFT)
        XL, F, N = stft(data[:, 1],
                        fs=Fs,
                        hopsize=hopsize,
                        window=sinebell(windowSizeInSamples),
                        nfft=NFT)
        # SX is the power spectrogram:
        ## SXR = np.maximum(np.abs(XR) ** 2, 10 ** -8)
        ## SXL = np.maximum(np.abs(XL) ** 2, 10 ** -8)
        #SXR = np.abs(XR) ** 2
        #SXL = np.abs(XL) ** 2
        SX = np.maximum((0.5 * np.abs(XR + XL))**2, eps)
    else:  # data is mono
        X, F, N = stft(data,
                       fs=Fs,
                       hopsize=hopsize,
                       window=sinebell(windowSizeInSamples),
                       nfft=NFT)
        SX = np.maximum(np.abs(X)**2, eps)

    del data, F, N

    # minimum and maximum F0 in glottal source spectra dictionary
    minF0 = options.minF0
    maxF0 = options.maxF0
    F, N = SX.shape
    stepNotes = options.stepNotes  # this is the number of F0s within one semitone

    K = int(options.K_numFilters)  # number of spectral shapes for the filter part
    P = int(options.P_numAtomFilters)  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
    # this feature should be further studied before
    # we find a good way of doing that.

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = \
             generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \
                                  stepNotes=stepNotes, \
                                  lengthWindow=windowSizeInSamples, Ot=0.25, \
                                  perF0=chirpPerF0, \
                                  depthChirpInSemiTone=.15, loadWF0=True,\
                                  analysisWindow='sinebell')
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization:
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=Fs, frequencyScale='linear', \
                               numberOfBasis=P, overlap=.75)

    if options.sal_output_file is None or not os.path.exists(
            options.sal_output_file):
        if displayEvolution:
            plt.figure(1)
            plt.clf()
            plt.xticks(fontsize=16)
            plt.yticks(fontsize=16)
            plt.xlabel(r'Frame number $n$', fontsize=16)
            plt.ylabel(r'Leading source number $u$', fontsize=16)
            plt.ion()
            # plt.show()
            ## the following seems superfluous if mpl's backend is macosx...
            ##        raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\
            ##                  "!! Press Return to resume the program. !!\n"\
            ##                  "!! Be sure that the figure has been    !!\n"\
            ##                  "!! already displayed, so that the      !!\n"\
            ##                  "!! evolution of HF0 will be visible.   !!\n"\
            ##                  "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

        ## section to estimate the melody, on monophonic algo:
        # First round of parameter estimation:
        if HPS:
            from scipy.signal import medfilt
        if is_stereo and stereoEstimation:
            SXR = np.maximum(np.abs(XR)**2, eps)
            SXL = np.maximum(np.abs(XL)**2, eps)
            if HPS:
                SXR = medfilt(SXR, 3)
                SXL = medfilt(SXL, 3)
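            # NB: scipy.signal.medfilt with a scalar kernel size applies a
            # 3x3 median over the time-frequency plane, a rough smoothing of
            # the power spectrograms before estimation (only when HPS is set).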

            alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, recoError1 = SIMM.Stereo_SIMM(
                # the data to be fitted to:
                SXR,
                SXL,
                # the basis matrices for the spectral combs
                WF0,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # putting only 2 elements in accompaniment for a start...
                # if any, initial amplitude matrices for
                HGAMMA0=None,
                HPHI0=None,
                HF00=None,
                WM0=None,
                HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter,
                updateRulePower=1.,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution)
        else:
            if HPS:
                SX = medfilt(SX, 3)

            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WF0,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # putting only 2 elements in accompaniment for a start...
                # if any, initial amplitude matrices for
                HGAMMA0=None,
                HPHI0=None,
                HF00=None,
                WM0=None,
                HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter,
                updateRulePower=1.,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution)
        if displayEvolution:
            h2 = plt.figure(2)
            plt.clf()
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])

    else:
        print "Loading Salience from file to calculate Melody: " + options.sal_output_file
        loaded = np.loadtxt(options.sal_output_file).T
        times = [loaded[0, :]]
        HF0 = loaded[1:, :]

    # If vit_pitch_output_file is not None, do melody extraction with Viterbi
    if options.vit_pitch_output_file is not None:
        print("Viterbi decoding")
        # Viterbi decoding to estimate the predominant fundamental
        # frequency line
        # create transition probability matrix - ad hoc parameter 'scale'
        # TODO: use "learned" parameter scale (NB: after many trials, the
        # provided scale and parameterization seem robust)
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
        cutoffnote = 2 * 5 * stepNotes
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = \
                                  transitions[\
            np.array(np.abs(np.outer(np.ones(NF0), b) \
                            - np.outer(b, np.ones(NF0))), dtype=int)]
        pf_0 = transitions[cutoffnote - 1] * 10**(-90)
        p0_0 = transitions[cutoffnote - 1] * 10**(-100)
        p0_f = transitions[cutoffnote - 1] * 10**(-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 \
                             / np.outer(sumTransitionMatrixF0, \
                                        np.ones(NF0 + 1))

        # prior probabilities, and setting the array for Viterbi tracking:
        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        barHF0 = np.array(HF0)

        logHF0[0:NF0, :] = np.log(barHF0)
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.inf]), -100)

        indexBestPath = viterbiTrackingArray(\
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=options.verbose)

        if displayEvolution:
            plt.figure(2)  # Figure.hold was removed from matplotlib;
            plt.plot(indexBestPath, '-b')  # overlaying is now the default
            plt.axis('tight')

        del logHF0

        # detection of silences:
        # computing the melody restricted F0 amplitude matrix HF00
        # (which will be used as initial HF0 for further algo):
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        scopeAllowedHF0 = 2.0 / 1.0
        # computing indices for and around the melody indices,
        # dim1index are indices along axis 0, and dim2index along axis 1
        # of HF0:
        #     TODO: use numpy broadcasting to make this "clearer" (if possible...)
        dim1index = np.array(\
            np.maximum(\
                np.minimum(\
                    np.outer(chirpPerF0 * indexBestPath,
                             np.ones(chirpPerF0 \
                                     * (2 \
                                        * int(np.floor(stepNotes / scopeAllowedHF0)) \
                                        + 1))) \
                    + np.outer(np.ones(N),
                               np.arange(-chirpPerF0 \
                                         * int(np.floor(stepNotes / scopeAllowedHF0)),
                                         chirpPerF0 \
                                         * int((np.floor(stepNotes / scopeAllowedHF0))) \
                                            + 1)),
                    chirpPerF0 * NF0 - 1),
                0),
            dtype=int).reshape(1, N * chirpPerF0 \
                               * (2 * int(np.floor(stepNotes / scopeAllowedHF0)) \
                                  + 1))
        dim2index = np.outer(np.arange(N),
                             np.ones(chirpPerF0 \
                                     * (2 * int(np.floor(stepNotes \
                                                     / scopeAllowedHF0)) + 1), \
                                     dtype=int)\
                             ).reshape(1, N * chirpPerF0 \
                                       * (2 * int(np.floor(stepNotes \
                                                       / scopeAllowedHF0)) \
                                          + 1))
        HF00[dim1index, dim2index] = HF0[dim1index, dim2index]  # HF0.max()

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        # remove frames with less than (100 thres_energy) % of total energy.
        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)

        energyMel = np.sum((((SPHI * SF0) / hatSX)**2) * SX, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized to the maximum of energy:
        # expressed in 0.01 times the percentage
        nonzeroInds = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        # np.nonzero never returns None; guard against an empty result instead:
        ind_999 = nonzeroInds[0] if nonzeroInds.size else N - 1
        outputDir = os.path.dirname(options.vit_pitch_output_file)
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        np.savetxt(options.vit_pitch_output_file + '.egy',
                   np.array(
                       [np.arange(N) * hopsize / np.double(Fs), energyMel]).T,
                   fmt='%10.5f')

        # energyMel <= energyMelCumul[ind_999]?

        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])

        # edit: frames predicted as unvoiced will be given negative values
        # indexBestPath[melNotPresent] = 0

        freqMelody = F0Table[np.array(np.minimum(indexBestPath,
                                                 len(F0Table) - 1),
                                      dtype=int)]
        freqMelody[melNotPresent] = -freqMelody[melNotPresent]


        np.savetxt(options.vit_pitch_output_file,
                   np.array(
                       [np.arange(N) * hopsize / np.double(Fs), freqMelody]).T,
                   fmt='%10.7f')

    times = np.array([np.arange(N) * hopsize / np.double(Fs)])

    # Save salience file:
    if options.sal_output_file is not None:
        salDir = os.path.dirname(options.sal_output_file)
        if salDir and not os.path.exists(salDir):
            os.makedirs(salDir)
        np.savetxt(options.sal_output_file,
                   np.concatenate((times, HF0), axis=0).T,
                   fmt='%10.6f')
        # saveSPHI (timbre related)
        saveSPHI = 0
        if saveSPHI:
            if not os.path.exists(
                    os.path.dirname(options.sal_output_file + '.SPHI')):
                os.makedirs(os.path.dirname(options.sal_output_file))
            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            np.savetxt(options.sal_output_file + '.SPHI',
                       np.concatenate((times, SPHI), axis=0).T,
                       fmt='%10.4f')
        #np.savetxt(options.sal_output_file+'.WGAMMA',np.concatenate((times,WGAMMA),axis=0).T,fmt='%10.4f')

    # return times[0],freqMelody,HF0
    print "Done!"
    return times[0], HF0, options
def main():
    import optparse

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option(
        "-v",
        "--vocal-output-file",
        dest="voc_output_file",
        type="string",
        help="name of the audio output file for the estimated\n"
        "solo (vocal) part. \n"
        "If None, appends _lead to inputAudioFile.",
        default=None,
    )
    parser.add_option(
        "-m",
        "--music-output-file",
        dest="mus_output_file",
        type="string",
        help="name of the audio output file for the estimated\n"
        "music part.\n"
        "If None, appends _acc to inputAudioFile.",
        default=None,
    )
    parser.add_option(
        "-p",
        "--pitch-output-file",
        dest="pitch_output_file",
        type="string",
        help="name of the output file for the estimated pitches.\n" "If None, appends _pitches to inputAudioFile",
        default=None,
    )

    # Some more optional options:
    parser.add_option(
        "-d", "--with-display", dest="displayEvolution", action="store_true", help="display the figures", default=False
    )
    parser.add_option(
        "-q", "--quiet", dest="verbose", action="store_false", help="use to quiet all output verbose", default=True
    )
    parser.add_option(
        "-n",
        "--dontseparate",
        dest="separateSignals",
        action="store_false",
        help="Trigger this option if you only desire to " + "estimate the melody",
        default=True,
    )
    parser.add_option("--nb-iterations", dest="nbiter", help="number of iterations", type="int", default=30)
    parser.add_option(
        "--window-size", dest="windowSize", type="float", default=0.04644, help="size of analysis windows, in s."
    )
    parser.add_option(
        "--Fourier-size",
        dest="fourierSize",
        type="int",
        default=None,
        help="size of Fourier transforms, " "in samples.",
    )
    parser.add_option(
        "--hopsize",
        dest="hopsize",
        type="float",
        default=0.0058,
        help="size of the hop between analysis windows, in s.",
    )
    parser.add_option(
        "--nb-accElements", dest="R", type="float", default=40.0, help="number of elements for the accompaniment."
    )

    parser.add_option(
        "--with-melody",
        dest="melody",
        type="string",
        default=None,
        help="provide the melody in a file named MELODY, " "with at each line: <time (s)><F0 (Hz)>.",
    )

    parser.add_option(
        "--numAtomFilters",
        dest="P_numAtomFilters",
        type="int",
        default=30,
        help="Number of atomic filters - in WGAMMA.",
    )
    parser.add_option(
        "--numFilters",
        dest="K_numFilters",
        type="int",
        default=10,
        help="Number of filters for decomposition - in WPHI",
    )
    parser.add_option(
        "--min-F0-Freq", dest="minF0", type="float", default=100.0, help="Minimum of fundamental frequency F0."
    )
    parser.add_option(
        "--max-F0-Freq", dest="maxF0", type="float", default=800.0, help="Maximum of fundamental frequency F0."
    )
    parser.add_option(
        "--step-F0s", dest="stepNotes", type="int", default=20, help="Number of F0s in dictionary for each semitone."
    )

    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        ## plt.rc('text', usetex=True)
        plt.rc("image", cmap="jet")  ## gray_r
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    if inputAudioFile[-4:] != ".wav":
        raise ValueError("File not WAV file? Only the WAV format is supported, for now...")

    if options.mus_output_file is None:
        options.mus_output_file = inputAudioFile[:-4] + "_acc.wav"

    if options.voc_output_file is None:
        options.voc_output_file = inputAudioFile[:-4] + "_lead.wav"

    if options.pitch_output_file is None:
        options.pitch_output_file = inputAudioFile[:-4] + "_pitches.txt"

    print "Writing the different following output files:"
    print "    separated lead          in", options.voc_output_file
    print "    separated accompaniment in", options.mus_output_file
    print "    separated lead + unvoc  in", options.voc_output_file[:-4] + "_VUIMM.wav"
    print "    separated acc  - unvoc  in", options.mus_output_file[:-4] + "_VUIMM.wav"
    print "    estimated pitches       in", options.pitch_output_file

    Fs, data = wav.read(inputAudioFile)
    # data = np.double(data) /  32768.0 # makes data vary from -1 to 1
    scaleData = 1.2 * data.max()  # to rescale the data.
    dataType = data.dtype
    data = np.double(data) / scaleData  # makes data vary from -1 to 1
    is_stereo = True
    if data.shape[0] == data.size:  # data has a single channel (mono)
        print("The audio file is not stereo: processing it as mono.")
        print("(You could also try the older separateLead.py...)")
        is_stereo = False
        # data = np.vstack([data,data]).T
        # raise ValueError("number of dimensions of the input not 2")
    if is_stereo and data.shape[1] != 2:
        print("The data is multichannel, but not stereo...")
        print("Unfortunately this program does not scale well. Data is")
        print("reduced to its first 2 channels.")
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = int(nextpow2(np.round(options.windowSize * Fs)))

    hopsize = np.round(options.hopsize * Fs)
    if hopsize != windowSizeInSamples / 8:
        # print "Overriding given hopsize to use 1/8th of window size"
        # hopsize = windowSizeInSamples/8
        warnings.warn("Chosen hopsize: " + str(hopsize) + ", while windowsize: " + str(windowSizeInSamples))

    if options.fourierSize is None:
        NFT = windowSizeInSamples
    else:
        NFT = options.fourierSize

    # number of iterations for each parameter estimation step:
    niter = options.nbiter
    # number of spectral shapes for the accompaniment
    R = options.R

    eps = 10 ** -9

    if options.verbose:
        print "Some parameter settings:"
        print "    Size of analysis windows: ", windowSizeInSamples
        print "    Hopsize: ", hopsize
        print "    Size of Fourier transforms: ", NFT
        print "    Number of iterations to be done: ", niter
        print "    Number of elements in WM: ", R

    if is_stereo:
        XR, F, N = stft(data[:, 0], fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT)
        XL, F, N = stft(data[:, 1], fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT)
        # SX is the power spectrogram:
        ## SXR = np.maximum(np.abs(XR) ** 2, 10 ** -8)
        ## SXL = np.maximum(np.abs(XL) ** 2, 10 ** -8)
        # SXR = np.abs(XR) ** 2
        # SXL = np.abs(XL) ** 2
        SX = np.maximum((0.5 * np.abs(XR + XL)) ** 2, eps)
    else:  # data is mono
        X, F, N = stft(data, fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT)
        SX = np.maximum(np.abs(X) ** 2, eps)

    del data, F, N

    # TODO: also process these as options:
    # minimum and maximum F0 in glottal source spectra dictionary
    minF0 = options.minF0
    maxF0 = options.maxF0
    F, N = SX.shape
    stepNotes = options.stepNotes  # this is the number of F0s within one semitone

    K = options.K_numFilters  # number of spectral shapes for the filter part
    P = options.P_numAtomFilters  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
    # this feature should be further studied before
    # we find a good way of doing that.

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = generate_WF0_chirped(
        minF0,
        maxF0,
        Fs,
        Nfft=NFT,
        stepNotes=stepNotes,
        lengthWindow=windowSizeInSamples,
        Ot=0.25,
        perF0=chirpPerF0,
        depthChirpInSemiTone=0.15,
        loadWF0=True,
        analysisWindow="sinebell",
    )
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization:
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=Fs, frequencyScale="linear", numberOfBasis=P, overlap=0.75)

    if displayEvolution:
        plt.figure(1)
        plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r"Frame number $n$", fontsize=16)
        plt.ylabel(r"Leading source number $u$", fontsize=16)
        plt.ion()
        # plt.show()
        ## the following seems superfluous if mpl's backend is macosx...
        ##        raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\
        ##                  "!! Press Return to resume the program. !!\n"\
        ##                  "!! Be sure that the figure has been    !!\n"\
        ##                  "!! already displayed, so that the      !!\n"\
        ##                  "!! evolution of HF0 will be visible.   !!\n"\
        ##                  "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    if options.melody is None:
        ## section to estimate the melody, on monophonic algo:
        # First round of parameter estimation:
        HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
            # the data to be fitted to:
            SX,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K,
            numberOfAccompanimentSpectralShapes=R,
            # putting only 2 elements in accompaniment for a start...
            # if any, initial amplitude matrices for
            HGAMMA0=None,
            HPHI0=None,
            HF00=None,
            WM0=None,
            HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter,
            updateRulePower=1.0,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SX.max()),
            alphaHF0=0.9,
            verbose=options.verbose,
            displayEvolution=displayEvolution,
        )

        if displayEvolution:
            h2 = plt.figure(2)
            plt.clf()
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])

        # Viterbi decoding to estimate the predominant fundamental
        # frequency line
        # create transition probability matrix - ad hoc parameter 'scale'
        # TODO: use "learned" parameter scale (NB: after many trials, the
        # provided scale and parameterization seem robust)
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
        cutoffnote = 2 * 5 * stepNotes
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = transitions[
            np.array(np.abs(np.outer(np.ones(NF0), b) - np.outer(b, np.ones(NF0))), dtype=int)
        ]
        pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
        p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
        p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 / np.outer(sumTransitionMatrixF0, np.ones(NF0 + 1))

        # prior probabilities, and setting the array for Viterbi tracking:
        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        barHF0 = np.array(HF0)

        logHF0[0:NF0, :] = np.log(barHF0)
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.inf]), -100)

        indexBestPath = viterbiTrackingArray(
            logHF0, np.log(priorProbabilities), np.log(transitionMatrixF0), verbose=options.verbose
        )

        if displayEvolution:
            plt.figure(2)  # Figure.hold was removed from matplotlib;
            plt.plot(indexBestPath, "-b")  # overlaying is now the default
            plt.axis("tight")

        del logHF0

        # detection of silences:
        # computing the melody restricted F0 amplitude matrix HF00
        # (which will be used as initial HF0 for further algo):
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        scopeAllowedHF0 = 2.0 / 1.0
        # computing indices for and around the melody indices,
        # dim1index are indices along axis 0, and dim2index along axis 1
        # of HF0:
        #     TODO: use numpy broadcasting to make this "clearer" (if possible...)
        dim1index = np.array(
            np.maximum(
                np.minimum(
                    np.outer(
                        chirpPerF0 * indexBestPath,
                        np.ones(int(chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1))),
                    )
                    + np.outer(
                        np.ones(N),
                        np.arange(
                            -chirpPerF0 * np.floor(stepNotes / scopeAllowedHF0),
                            chirpPerF0 * (np.floor(stepNotes / scopeAllowedHF0) + 1),
                        ),
                    ),
                    chirpPerF0 * NF0 - 1,
                ),
                0,
            ),
            dtype=int,
        ).reshape(1, int(N * chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1)))
        dim2index = np.outer(
            np.arange(N), np.ones(int(chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1)), dtype=int)
        ).reshape(1, int(N * chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1)))
        HF00[dim1index, dim2index] = HF0[dim1index, dim2index]  # HF0.max()

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        # remove frames with less than (100 thres_energy) % of total energy.
        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)
        energyMel = np.sum((((SPHI * SF0) / hatSX) ** 2) * SX, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized so that the cumulative energy reaches 1;
        # thres_energy is hence a fraction of the total energy:
        nonzeroInd = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        # np.nonzero never returns None; guard against an empty result
        # instead, clamping to the last valid index:
        ind_999 = nonzeroInd[0] if nonzeroInd.size else N - 1
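        # Added note on the threshold: frame energies are sorted in
        # increasing order and accumulated, and ind_999 is the first index
        # at which the normalized cumulative sum exceeds thres_energy; the
        # comparison below then flags the lowest-energy frames as silent.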

        melNotPresent = energyMel <= energyMelCumulNorm[ind_999]
        indexBestPath[melNotPresent] = 0

    else:
        ## take the provided melody line:
        # load melody from file:
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            print "The melody should be provided as <Time (s)><F0 (Hz)>."
            raise ValueError("Bad melody format")
        melTimeStamps = melodyFromFile[:, 0]  # + 1024 / np.double(Fs)
        melFreqHz = melodyFromFile[:, 1]
        if minF0 > melFreqHz[melFreqHz > 40.0].min() or maxF0 < melFreqHz.max():
            minF0 = melFreqHz[melFreqHz > 40.0].min() * 0.97
            maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03)
            print "Recomputing the source basis for "
            print "minF0 = ", minF0, "Hz and maxF0 = ", maxF0, "Hz."
            # Create the harmonic combs, for each F0 between minF0 and maxF0:
            F0Table, WF0 = generate_WF0_chirped(
                minF0,
                maxF0,
                Fs,
                Nfft=NFT,
                stepNotes=stepNotes,
                lengthWindow=windowSizeInSamples,
                Ot=0.25,
                perF0=chirpPerF0,
                depthChirpInSemiTone=0.15,
            )
            WF0 = WF0[0:F, :]  # ensure same size as SX
            NF0 = F0Table.size  # number of harmonic combs
            # Normalization:
            WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

        sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
        distMatTimeStamps = np.abs(
            np.outer(np.ones(sizeProvidedMel[0]), sigTimeStamps) - np.outer(melTimeStamps, np.ones(N))
        )
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(np.outer(np.ones(NF0), f0BestPath) - np.outer(F0Table, np.ones(N)))
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length
        indexBestPath[distMatTimeStamps[minDistTimeStamps, range(N)] >= 0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath <= 0] = 0
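        # Minimal sketch (hypothetical values) of the nearest-neighbour
        # alignment used above: each analysis frame picks the provided melody
        # sample whose time stamp is closest.
        #     melT = np.array([0.0, 0.5, 1.0])   # provided time stamps
        #     sigT = np.array([0.1, 0.4, 0.8])   # frame time stamps
        #     d = np.abs(melT[:, np.newaxis] - sigT[np.newaxis, :])
        #     nearest = d.argmin(axis=0)          # -> array([0, 1, 2])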

    freqMelody = F0Table[np.array(indexBestPath, dtype=int)]
    freqMelody[indexBestPath == 0] = -freqMelody[indexBestPath == 0]
    np.savetxt(options.pitch_output_file, np.array([np.arange(N) * hopsize / np.double(Fs), freqMelody]).T)
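    # Added note on the pitch file format: two whitespace-separated columns,
    # <time (s)> <F0 (Hz)>, where a negated frequency flags an unvoiced
    # frame. A read-back sketch (same file name as written above):
    #     t, f0 = np.loadtxt(options.pitch_output_file).T
    #     voiced = f0 > 0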

    # If separation is required:
    if options.separateSignals:
        # Second round of parameter estimation, with specific
        # initial HF00:
        HF00 = np.zeros([NF0 * chirpPerF0, N])

        scopeAllowedHF0 = 2.0 / 1.0

        # indexes for HF00:
        # TODO: reprogram this with a 'where'?...
        dim1index = np.array(
            np.maximum(
                np.minimum(
                    np.outer(
                        chirpPerF0 * indexBestPath,
                        np.ones(chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1)),
                    )
                    + np.outer(
                        np.ones(N),
                        np.arange(
                            -chirpPerF0 * np.floor(stepNotes / scopeAllowedHF0),
                            chirpPerF0 * (np.floor(stepNotes / scopeAllowedHF0) + 1),
                        ),
                    ),
                    chirpPerF0 * NF0 - 1,
                ),
                0,
            ),
            dtype=int,
        )
        dim1index = dim1index[indexBestPath != 0, :]
        ## dim1index = dim1index.reshape(1, N * chirpPerF0 \
        ##                        * (2 * np.floor(stepNotes / scopeAllowedHF0) \
        ##                          + 1))
        dim1index = dim1index.reshape(1, dim1index.size)

        dim2index = np.outer(
            np.arange(N), np.ones(chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1), dtype=int)
        )
        dim2index = dim2index[indexBestPath != 0, :]
        dim2index = dim2index.reshape(1, dim2index.size)
        ## dim2index.reshape(1, N * chirpPerF0 \
        ##                                * (2 * np.floor(stepNotes \
        ##                                                / scopeAllowedHF0) \
        ##                                   + 1))
        HF00[dim1index, dim2index] = 1  # HF0.max()

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        WF0effective = WF0
        HF00effective = HF00

        if options.melody is None:
            del HF0, HGAMMA, HPHI, HM, WM, HF00

        if is_stereo:
            del SX
            SXR = np.maximum(np.abs(XR) ** 2, eps)
            SXL = np.maximum(np.abs(XL) ** 2, eps)
            alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM(
                # the data to be fitted to:
                SXR,
                SXL,
                # the basis matrices for the spectral combs
                WF0effective,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # if any, initial amplitude matrices for
                HGAMMA0=None,
                HPHI0=None,
                HF00=HF00effective,
                WM0=None,
                HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter,
                updateRulePower=1.0,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SXR.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution,
            )

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WF0effective, HF0)

            hatSXR = (alphaR ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaR ** 2), HM)
            hatSXL = (alphaL ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaL ** 2), HM)
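            # Added note: these are the SIMM channel models, with the lead
            # part SF0 * SPHI scaled by the squared channel gains alphaR /
            # alphaL, and the accompaniment WM (weighted by the squared
            # channel gains betaR / betaL) times HM.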

            hatVR = (alphaR ** 2) * SPHI * SF0 / hatSXR * XR

            vestR = istft(hatVR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            hatVR = (alphaL ** 2) * SPHI * SF0 / hatSXL * XL

            vestL = istft(hatVR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            # scikits.audiolab.wavwrite(np.array([vestR,vestL]).T, \
            #                          options.voc_output_file, Fs)

            vestR = np.array(np.round(vestR * scaleData), dtype=dataType)
            vestL = np.array(np.round(vestL * scaleData), dtype=dataType)
            wav.write(options.voc_output_file, Fs, np.array([vestR, vestL]).T)

            # wav.write(options.voc_output_file, Fs, \
            #          np.int16(32768.0 * np.array([vestR,vestL]).T))

            hatMR = (np.dot(np.dot(WM, betaR ** 2), HM)) / hatSXR * XR

            mestR = istft(hatMR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            hatMR = (np.dot(np.dot(WM, betaL ** 2), HM)) / hatSXL * XL

            mestL = istft(hatMR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            # scikits.audiolab.wavwrite(np.array([mestR,mestL]).T, \
            #                          options.mus_output_file, Fs)

            mestR = np.array(np.round(mestR * scaleData), dtype=dataType)
            mestL = np.array(np.round(mestL * scaleData), dtype=dataType)
            wav.write(options.mus_output_file, Fs, np.array([mestR, mestL]).T)

            # wav.write(options.mus_output_file, Fs, \
            #          np.int16(32768.0 * np.array([mestR,mestL]).T))

            del hatMR, mestL, vestL, vestR, mestR, hatVR, hatSXR, hatSXL, SPHI, SF0

            # adding the unvoiced part in the source basis:
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            ## HUF0[-1,:] = HF0.sum(axis=0) # should we do this?
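            # Added note: the extra flat column in WUF0 (all ones) acts as a
            # broadband "unvoiced" atom, and HUF0 gains the matching row of
            # activations, so this round can also capture unvoiced sounds.
            # Shape check (illustrative):
            #     WUF0.shape == (WF0.shape[0], WF0.shape[1] + 1)
            #     HUF0.shape == (HF0.shape[0] + 1, HF0.shape[1])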

            alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM(
                # the data to be fitted to:
                SXR,
                SXL,
                # the basis matrices for the spectral combs
                WUF0,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # if any, initial amplitude matrices for
                HGAMMA0=HGAMMA,
                HPHI0=HPHI,
                HF00=HUF0,
                WM0=None,  # WM,
                HM0=None,  # HM,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter,
                updateRulePower=1.0,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SXR.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution,
                updateHGAMMA=False,
            )

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WUF0, HF0)

            hatSXR = (alphaR ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaR ** 2), HM)
            hatSXL = (alphaL ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaL ** 2), HM)

            hatVR = (alphaR ** 2) * SPHI * SF0 / hatSXR * XR

            vestR = istft(hatVR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            hatVR = (alphaL ** 2) * SPHI * SF0 / hatSXL * XL

            vestL = istft(hatVR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            outputFileName = options.voc_output_file[:-4] + "_VUIMM.wav"

            vestR = np.array(np.round(vestR * scaleData), dtype=dataType)
            vestL = np.array(np.round(vestL * scaleData), dtype=dataType)
            wav.write(outputFileName, Fs, np.array([vestR, vestL]).T)

            hatMR = (np.dot(np.dot(WM, betaR ** 2), HM)) / hatSXR * XR

            mestR = istft(hatMR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            hatMR = (np.dot(np.dot(WM, betaL ** 2), HM)) / hatSXL * XL

            mestL = istft(hatMR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            outputFileName = options.mus_output_file[:-4] + "_VUIMM.wav"

            mestR = np.array(np.round(mestR * scaleData), dtype=dataType)
            mestL = np.array(np.round(mestL * scaleData), dtype=dataType)
            wav.write(outputFileName, Fs, np.array([mestR, mestL]).T)
        else:
            # running on monophonic data:
            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WF0effective,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # putting only 2 elements in accompaniment for a start...
                # if any, initial amplitude matrices for
                HGAMMA0=None,
                HPHI0=None,
                HF00=HF00effective,
                WM0=None,
                HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter,
                updateRulePower=1.0,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution,
            )

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WF0effective, HF0)
            SM = np.dot(WM, HM)

            hatSX = SF0 * SPHI + SM

            hatV = SPHI * SF0 / hatSX * X

            vest = istft(hatV, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            vest = np.array(np.round(vest * scaleData), dtype=dataType)
            wav.write(options.voc_output_file, Fs, vest)

            hatM = SM / hatSX * X

            mest = istft(hatM, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            mest = np.array(np.round(mest * scaleData), dtype=dataType)
            wav.write(options.mus_output_file, Fs, mest)
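            # Added note: the two estimates above are adaptive Wiener masks,
            # SPHI * SF0 / hatSX for the voice and SM / hatSX for the music;
            # since hatSX = SF0 * SPHI + SM, the masks sum to one in every
            # time-frequency bin, so hatV + hatM gives back X exactly.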

            del hatM, vest, mest, hatV, hatSX, SPHI, SF0

            # adding the unvoiced part in the source basis:
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            ## HUF0[-1,:] = HF0.sum(axis=0) # should we do this?

            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WUF0,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # putting only 2 elements in accompaniment for a start...
                # if any, initial amplitude matrices for
                HGAMMA0=HGAMMA,
                HPHI0=HPHI,
                HF00=HUF0,
                WM0=None,
                HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter,
                updateRulePower=1.0,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution,
                updateHGAMMA=False,
            )

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WUF0, HF0)
            SM = np.dot(WM, HM)

            hatSX = SF0 * SPHI + SM

            hatV = SPHI * SF0 / hatSX * X

            vest = istft(hatV, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            vest = np.array(np.round(vest * scaleData), dtype=dataType)
            outputFileName = options.voc_output_file[:-4] + "_VUIMM.wav"
            wav.write(outputFileName, Fs, vest)

            hatM = SM / hatSX * X

            mest = istft(hatM, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            mest = np.array(np.round(mest * scaleData), dtype=dataType)

            outputFileName = options.mus_output_file[:-4] + "_VUIMM.wav"
            wav.write(outputFileName, Fs, mest)

        if displayEvolution:
            plt.close("all")

    print "Done!"
Example #6
    def runViterbi(self):
        if 'HF0' not in self.SIMMParams:
            raise AttributeError("HF0 has probably not been estimated yet.")
        
        # Viterbi decoding to estimate the predominant fundamental
        # frequency line
        scale = 1.0
        NF0 = self.SIMMParams['NF0']
        transitions = np.exp(-np.floor(np.arange(0, NF0)/\
                                       self.SIMMParams['stepNotes']) * \
                             scale)
        cutoffnote = 2 * 5 * self.SIMMParams['stepNotes']
        transitions[cutoffnote:] = transitions[cutoffnote - 1]
        
        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1]) # toeplitz matrix
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = \
            transitions[\
                np.array(np.abs(np.outer(np.ones(NF0), b) \
                                - np.outer(b, np.ones(NF0))), dtype=int)]
        pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
        p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
        p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0
        
        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 \
                             / np.outer(sumTransitionMatrixF0, \
                                        np.ones(NF0 + 1))
        
        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, self.N])
        normHF0 = np.amax(self.SIMMParams['HF0'], axis=0)
        barHF0 = np.array(self.SIMMParams['HF0'])
        
        logHF0[0:NF0, :] = np.log(barHF0)
        logHF0[0:NF0, normHF0==0] = np.amin(logHF0[logHF0>-np.Inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0>-np.Inf]),-100)
        
        print "Running Viterbi algorithm to track the melody, " + \
              str(self.N) + " frames."
        indexBestPath = viterbiTrackingArray(\
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=False)
        print "Viterbi algorithm done..."

        # drawing this as a line is actually a bit confusing, on the image
        #     TODO: think of a better representation (is contour good enough?)
        ##if self.displayEvolution and not(self.imageCanvas is None):
        ##    self.imageCanvas.ax.plot(indexBestPath, '-b')
        ##    self.imageCanvas.ax.axis('tight')
        ##    self.imageCanvas.draw()
        
        del logHF0
        
        # detection of silences:
        chirpPerF0 = self.SIMMParams['chirpPerF0']
        stepNotes = self.SIMMParams['stepNotes']
        HF00 = np.zeros([NF0 * chirpPerF0, self.N])
        scopeAllowedHF0 = self.scopeAllowedHF0  # 4.0 / 1.0 # 2.0 / 1.0
        dim1index = np.array(\
            np.maximum(\
                np.minimum(\
                    np.outer(chirpPerF0 * indexBestPath,
                             np.ones(chirpPerF0 \
                                     * (2 \
                                        * np.floor(stepNotes / scopeAllowedHF0) \
                                        + 1))) \
                    + np.outer(np.ones(self.N),
                               np.arange(-chirpPerF0 \
                                         * np.floor(stepNotes / scopeAllowedHF0),
                                         chirpPerF0 \
                                         * (np.floor(stepNotes / scopeAllowedHF0) \
                                            + 1))),
                    chirpPerF0 * NF0 - 1),
                0),
            dtype=int).reshape(1, self.N * chirpPerF0 \
                               * (2 * np.floor(stepNotes / scopeAllowedHF0) \
                                  + 1))
        dim2index = np.outer(np.arange(self.N),
                             np.ones(chirpPerF0 \
                                     * (2 * np.floor(stepNotes \
                                                     / scopeAllowedHF0) + 1), \
                                     dtype=int)\
                             ).reshape(1, self.N * chirpPerF0 \
                                       * (2 * np.floor(stepNotes \
                                                       / scopeAllowedHF0) \
                                          + 1))
        HF00[dim1index, dim2index] = self.SIMMParams['HF0'][dim1index, dim2index]
        
        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0
        
        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(self.SIMMParams['WF0'], HF00), eps)
        SPHI = np.maximum(np.dot(self.SIMMParams['WGAMMA'], \
                                 np.dot(self.SIMMParams['HGAMMA'],
                                        self.SIMMParams['HPHI'])), eps)
        SM = np.maximum(np.dot(self.SIMMParams['WM'], \
                               self.SIMMParams['HM']), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)
        energyMel = np.sum(np.abs((SPHI * SF0)/hatSX * \
                                  (self.XR + self.XL) * 0.5) \
                           ** 2, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized so that the cumulative energy reaches 1;
        # thres_energy is hence a fraction of the total energy:
        nonzeroInd = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        # np.nonzero never returns None; guard against an empty result
        # instead, clamping to the last valid index:
        ind_999 = nonzeroInd[0] if nonzeroInd.size else self.N - 1
        
        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])
        indexBestPath[melNotPresent] = 0
        
        freqMelody = self.SIMMParams['F0Table'][np.array(indexBestPath,dtype=int)]
        freqMelody[indexBestPath==0] = - freqMelody[indexBestPath==0]
        np.savetxt(self.files['pitch_output_file'],
                   np.array([np.arange(self.N) * \
                             self.stftParams['hopsize'] / np.double(self.fs),
                             freqMelody]).T)
        
        self.indexBestPath = indexBestPath
        self.freqMelody = freqMelody
Example #7
def main():
    import optparse
    
    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\nsolo (vocal) part",
                      default="estimated_solo.wav")
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\nmusic part",
                      default="estimated_music.wav")
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated pitches",
                      default="pitches.txt")

    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true",help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to quiet all output verbose",
                      default=True)
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int",
                      default=50)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644,help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=2048, help="size of Fourier transforms, in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")

    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        plt.rc('text', usetex=True)
        plt.rc('image',cmap='gray_r')
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    fs, data = wav.read(inputAudioFile)
    #data, fs, enc = scikits.audiolab.wavread(inputAudioFile)
    if data.shape[0] != data.size: # data is multi-channel
        data = np.mean(data,axis=1)

    # Processing the options:
    windowSizeInSamples = np.round(options.windowSize * fs)
    hopsize = np.round(options.hopsize * fs)
    NFT = options.fourierSize
    niter = options.nbiter

    if options.verbose:
        print "Size of analysis windows: ", windowSizeInSamples, "\n"
        print "Hopsize: ", hopsize, "\n"
        print "Size of Fourier transforms: ", NFT, "\n"
        print "Number of iterations to be done: ", niter, "\n"
    
    X, F, N = stft(data, fs=fs, hopsize=hopsize,
                   window=sinebell(windowSizeInSamples), nfft=NFT)
    # SX is the power spectrogram:
    SX = np.maximum(np.abs(X) ** 2, 10 ** -8)
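    # Added note: flooring the power spectrogram at 10 ** -8 avoids exact
    # zeros in the data, keeping the multiplicative update rules of SIMM
    # numerically safe (their divisions and logs require positive entries).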

    del data, F, N

    # TODO: also process these as options:
    minF0 = 100
    maxF0 = 800
    Fs = fs
    F, N = SX.shape
    stepNotes = 20 # this is the number of F0s within one semitone
    K = 50 # number of spectral shapes for the filter part
    R = 40 # number of spectral shapes for the accompaniment
    P = 30 # number of elements in dictionary of smooth filters
    chirpPerF0 = 1 # number of chirped spectral shapes between each F0
                   # this feature needs further study before a good
                   # default can be chosen.

    # Create the harmonic combs, for each F0 between minF0 and maxF0: 
    F0Table, WF0 = \
             generate_WF0_chirped(minF0, maxF0, Fs, Nfft=2048, \
                                  stepNotes=stepNotes, \
                                  lengthWindow=2048, Ot=0.25, \
                                  perF0=chirpPerF0, \
                                  depthChirpInSemiTone=.15)
    WF0 = WF0[0:F, :] # ensure same size as SX 
    NF0 = F0Table.size # number of harmonic combs
    # Normalization: 
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))
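    # Added aside (not in the original script): the division above rescales
    # every column of WF0 to a maximum of 1, e.g. on a tiny matrix:
    _W = np.array([[2.0, 1.0], [4.0, 3.0]])
    _W = _W / np.outer(np.ones(2), np.amax(_W, axis=0))
    assert np.allclose(np.amax(_W, axis=0), 1.0)  # column maxima are now 1
    del _W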

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, 2048, Fs=fs, frequencyScale='linear', \
                               numberOfBasis=P, overlap=.75)

    if displayEvolution:
        plt.figure(1);plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()
        # plt.show()
        raw_input("Press Return to resume the program. \nBe sure that the figure has been already displayed, so that the evolution of HF0 will be visible. ")

    # First round of parameter estimation:
    HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
        # the data to be fitted to:
        SX,
        # the basis matrices for the spectral combs
        WF0,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
        # putting only 2 elements in accompaniment for a start...
        # if any, initial amplitude matrices for 
        HGAMMA0=None, HPHI0=None,
        HF00=None,
        WM0=None, HM0=None,
        # Some more optional arguments, to control the "convergence"
        # of the algo
        numberOfIterations=niter, updateRulePower=1.,
        stepNotes=stepNotes, 
        lambdaHF0 = 0.0 / (1.0 * SX.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution)

    if displayEvolution:
        plt.figure(3);plt.clf()
        plt.subplot(221)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 0])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")
        plt.subplot(222)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:,1])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")
        plt.subplot(223)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 2])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")
        plt.subplot(224)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 3])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")

        plt.figure(4);plt.clf()
        imageMatlab.imageM(db(np.dot(np.dot(WGAMMA, HGAMMA), HPHI)))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        plt.figure(5);plt.clf()
        imageMatlab.imageM(db(HF0), vmin=-100, cmap=plt.cm.gray_r)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        # plt.xlim([3199.5, 3500.5]) # For detailed picture of HF0
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        plt.figure(6);plt.clf()
        imageMatlab.imageM(db(np.dot(WM, HM)), vmin=-50)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        plt.figure(7);plt.clf()
        imageMatlab.imageM(db(WM), vmin=-50)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Element number $r$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        
    if displayEvolution:
        h2 = plt.figure(2);plt.clf();
        imageMatlab.imageM(20 * np.log10(HF0))
        matMax = (20 * np.log10(HF0)).max()
        matMed = np.median(20 * np.log10(HF0))
        plt.clim([matMed - 100, matMax])

    # Viterbi decoding to estimate the predominant fundamental
    # frequency line
    scale = 1.0
    transitions = np.exp(-np.floor(np.arange(0,NF0) / stepNotes) * scale)
    cutoffnote = 2 * 5 * stepNotes
    transitions[cutoffnote:] = transitions[cutoffnote - 1]

    transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1]) # toeplitz matrix
    b = np.arange(NF0)
    transitionMatrixF0[0:NF0, 0:NF0] = \
                              transitions[\
        np.array(np.abs(np.outer(np.ones(NF0), b) \
                        - np.outer(b, np.ones(NF0))), dtype=int)]
    pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
    p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
    p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
    transitionMatrixF0[0:NF0, NF0] = pf_0
    transitionMatrixF0[NF0, 0:NF0] = p0_f
    transitionMatrixF0[NF0, NF0] = p0_0

    sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
    transitionMatrixF0 = transitionMatrixF0 \
                         / np.outer(sumTransitionMatrixF0, \
                                    np.ones(NF0 + 1))

    priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
    logHF0 = np.zeros([NF0 + 1, N])
    normHF0 = np.amax(HF0, axis=0)
    barHF0 = np.array(HF0)

    logHF0[0:NF0, :] = np.log(barHF0)
    logHF0[0:NF0, normHF0==0] = np.amin(logHF0[logHF0>-np.Inf])
    logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0>-np.Inf]),-100)

    indexBestPath = viterbiTrackingArray(\
        logHF0, np.log(priorProbabilities), np.log(transitionMatrixF0))

    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N)*options.hopsize,
                         F0Table[np.array(indexBestPath,dtype=int)]]).T)

    if displayEvolution:
        h2.hold(True)
        plt.plot(indexBestPath, '-b')
        h2.hold(False)
        plt.axis('tight')
        raw_input("Press Return to resume the program...")

    del logHF0

    # Second round of parameter estimation, with specific
    # initial HF00:
    HF00 = np.zeros([NF0 * chirpPerF0, N])

    scopeAllowedHF0 = 1.0 / 1.0

    # indexes for HF00:
    # TODO: reprogram this with a 'where'?...
    dim1index = np.array(\
        np.maximum(\
        np.minimum(\
        np.outer(chirpPerF0 * indexBestPath,
                 np.ones(chirpPerF0 \
                         * (2 \
                            * np.floor(stepNotes / scopeAllowedHF0) \
                            + 1))) \
        + np.outer(np.ones(N),
                   np.arange(-chirpPerF0 \
                             * np.floor(stepNotes / scopeAllowedHF0),
                             chirpPerF0 \
                             * (np.floor(stepNotes / scopeAllowedHF0) \
                                + 1))),
        chirpPerF0 * NF0 - 1),
        0),
        dtype=int).reshape(1, N * chirpPerF0 \
                           * (2 * np.floor(stepNotes / scopeAllowedHF0) \
                              + 1))
    dim2index = np.outer(np.arange(N),
                         np.ones(chirpPerF0 \
                                 * (2 * np.floor(stepNotes \
                                                 / scopeAllowedHF0) + 1), \
                                 dtype=int)\
                         ).reshape(1, N * chirpPerF0 \
                                   * (2 * np.floor(stepNotes \
                                                   / scopeAllowedHF0) \
                                      + 1))
    HF00[dim1index, dim2index] = 1 # HF0.max()

    HF00[:, indexBestPath == (NF0 - 1)] = 0.0

    WF0effective = WF0
    HF00effective = HF00

    del HF0, HGAMMA, HPHI, HM, WM, HF00

    HGAMMA, HPHI, HF0, HM, WM, recoError2 = SIMM.SIMM(
        # the data to be fitted to:
        SX,
        # the basis matrices for the spectral combs
        WF0effective,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
        # if any, initial amplitude matrices for
        HGAMMA0=None, HPHI0=None,
        HF00=HF00effective,
        WM0=None, HM0=None,
        # Some more optional arguments, to control the "convergence"
        # of the algo
        numberOfIterations=niter, updateRulePower=1.0,
        stepNotes=stepNotes, 
        lambdaHF0 = 0.0 / (1.0 * SX.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution)

    WPHI = np.dot(WGAMMA, HGAMMA)
    SPHI = np.dot(WPHI, HPHI)
    SF0 = np.dot(WF0effective, HF0)
    SM = np.dot(WM, HM)

    hatSX = SPHI * SF0 + SM

    hatV = SPHI * SF0 / hatSX * X

    vest = istft(hatV, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0
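    # Added note: the sinebell window is applied at both analysis and
    # synthesis, so overlapping frames are weighted by sin ** 2 twice over;
    # the empirical / 4.0 compensates for the resulting overlap-add gain at
    # this hop size (an assumption carried over from these scripts, not
    # derived here).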

    # scikits.audiolab.wavwrite(vest, options.voc_output_file, fs)
    wav.write(options.voc_output_file, fs, vest)
    hatM = SM / hatSX * X

    mest = istft(hatM, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0

    # scikits.audiolab.wavwrite(mest, options.mus_output_file, fs)
    wav.write(options.mus_output_file, fs, mest)
    if displayEvolution:
        plt.figure(13);plt.clf()
        plt.subplot(221)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 0])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")
        plt.subplot(222)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 1])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")
        plt.subplot(223)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 2])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")
        plt.subplot(224)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 3])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")

        plt.figure(14);plt.clf()
        imageMatlab.imageM(db(np.dot(np.dot(WGAMMA, HGAMMA), HPHI)))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        plt.figure(141);plt.clf()
        SVhat = db(np.dot(np.dot(WGAMMA, HGAMMA), HPHI)) \
                + db(np.dot(WF0, HF0))
        imageMatlab.imageM(SVhat, vmax=SVhat.max(),
                           vmin=SVhat.max() - 50)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        plt.figure(15);plt.clf()
        imageMatlab.imageM(db(HF0), vmin=-100, cmap=plt.cm.gray_r)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        # plt.xlim([3199.5, 3500.5]) # For detailed picture of HF0

        plt.figure(16)
        plt.clf()
        imageMatlab.imageM(db(np.dot(WM, HM)),
                           vmin=np.maximum(-50, db(SM.min())))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        plt.figure(17)
        plt.clf()
        imageMatlab.imageM(db(WM), vmin=-50)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Element number $r$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        raw_input("Press Return to end the program...")
        print "Done!"
Example #8
    def runViterbi(self):
        if 'HF0' not in self.SIMMParams:
            raise AttributeError("HF0 has probably not been estimated yet.")

        # Viterbi decoding to estimate the predominant fundamental
        # frequency line
        scale = 1.0
        NF0 = self.SIMMParams['NF0']
        transitions = np.exp(-np.floor(np.arange(0, NF0)/\
                                       self.SIMMParams['stepNotes']) * \
                             scale)
        cutoffnote = 2 * 5 * self.SIMMParams['stepNotes']
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = \
            transitions[\
                np.array(np.abs(np.outer(np.ones(NF0), b) \
                                - np.outer(b, np.ones(NF0))), dtype=int)]
        pf_0 = transitions[cutoffnote - 1] * 10**(-90)
        p0_0 = transitions[cutoffnote - 1] * 10**(-100)
        p0_f = transitions[cutoffnote - 1] * 10**(-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 \
                             / np.outer(sumTransitionMatrixF0, \
                                        np.ones(NF0 + 1))

        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, self.N])
        normHF0 = np.amax(self.SIMMParams['HF0'], axis=0)
        barHF0 = np.array(self.SIMMParams['HF0'])

        logHF0[0:NF0, :] = np.log(barHF0)
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.Inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.Inf]), -100)

        print "Running Viterbi algorithm to track the melody, " + \
              str(self.N) + " frames."
        indexBestPath = viterbiTrackingArray(\
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=False)
        print "Viterbi algorithm done..."

        # drawing this as a line is actually a bit confusing, on the image
        #     TODO: think of a better representation (is contour good enough?)
        ##if self.displayEvolution and not(self.imageCanvas is None):
        ##    self.imageCanvas.ax.plot(indexBestPath, '-b')
        ##    self.imageCanvas.ax.axis('tight')
        ##    self.imageCanvas.draw()

        del logHF0

        # detection of silences:
        chirpPerF0 = self.SIMMParams['chirpPerF0']
        stepNotes = self.SIMMParams['stepNotes']
        HF00 = np.zeros([NF0 * chirpPerF0, self.N])
        scopeAllowedHF0 = self.scopeAllowedHF0  # 4.0 / 1.0 # 2.0 / 1.0
        dim1index = np.array(\
            np.maximum(\
                np.minimum(\
                    np.outer(chirpPerF0 * indexBestPath,
                             np.ones(chirpPerF0 \
                                     * (2 \
                                        * np.floor(stepNotes / scopeAllowedHF0) \
                                        + 1))) \
                    + np.outer(np.ones(self.N),
                               np.arange(-chirpPerF0 \
                                         * np.floor(stepNotes / scopeAllowedHF0),
                                         chirpPerF0 \
                                         * (np.floor(stepNotes / scopeAllowedHF0) \
                                            + 1))),
                    chirpPerF0 * NF0 - 1),
                0),
            dtype=int).reshape(1, self.N * chirpPerF0 \
                               * (2 * np.floor(stepNotes / scopeAllowedHF0) \
                                  + 1))
        dim2index = np.outer(np.arange(self.N),
                             np.ones(chirpPerF0 \
                                     * (2 * np.floor(stepNotes \
                                                     / scopeAllowedHF0) + 1), \
                                     dtype=int)\
                             ).reshape(1, self.N * chirpPerF0 \
                                       * (2 * np.floor(stepNotes \
                                                       / scopeAllowedHF0) \
                                          + 1))
        HF00[dim1index, dim2index] = self.SIMMParams['HF0'][dim1index,
                                                            dim2index]

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(self.SIMMParams['WF0'], HF00), eps)
        SPHI = np.maximum(np.dot(self.SIMMParams['WGAMMA'], \
                                 np.dot(self.SIMMParams['HGAMMA'],
                                        self.SIMMParams['HPHI'])), eps)
        SM = np.maximum(np.dot(self.SIMMParams['WM'], \
                               self.SIMMParams['HM']), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)
        energyMel = np.sum(np.abs((SPHI * SF0)/hatSX * \
                                  (self.XR + self.XL) * 0.5) \
                           ** 2, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized so that the cumulative energy reaches 1;
        # thres_energy is hence a fraction of the total energy:
        nonzeroInd = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        # np.nonzero never returns None; guard against an empty result
        # instead, clamping to the last valid index:
        ind_999 = nonzeroInd[0] if nonzeroInd.size else self.N - 1

        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])
        indexBestPath[melNotPresent] = 0

        freqMelody = self.SIMMParams['F0Table'][np.array(indexBestPath,
                                                         dtype=int)]
        freqMelody[indexBestPath == 0] = -freqMelody[indexBestPath == 0]
        np.savetxt(self.files['pitch_output_file'],
                   np.array([np.arange(self.N) * \
                             self.stftParams['hopsize'] / np.double(self.fs),
                             freqMelody]).T)

        self.indexBestPath = indexBestPath
        self.freqMelody = freqMelody