Example #1
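
The examples below assume roughly the following imports. The helpers util,
transformFFT, LargeDataset (and its Mask variants), load_model/save_model, and
essentia come from the surrounding project, so the exact import paths may
differ; Example #7 additionally uses Theano's RandomStreams:

import os
import time
import logging

import numpy as np
import theano
import theano.tensor as T
import lasagne
from theano.tensor.shared_randomstreams import RandomStreams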
def train_auto(train,
               fun,
               transform,
               testdir,
               outdir,
               testfile_list,
               testdir1,
               outdir1,
               testfile_list1,
               num_epochs=30,
               model="1.pkl",
               scale_factor=0.3,
               load=False,
               skip_train=False,
               skip_sep=False):
    """
    Trains a network built with \"fun\" with the data generated with \"train\"
    and then separates the files in \"testdir\",writing the result in \"outdir\"

    Parameters
    ----------
    train : Callable, e.g. LargeDataset object
        The callable which generates training data for the network: inputs, target = train()
    fun : lasagne network object, Theano tensor
        The network to be trained
    transform : transformFFT object
        The Transform object which was used to compute the features (see compute_features.py)
    testdir : string, optional
        The directory where the files to be separated are located
    outdir : string, optional
        The directory where to write the separated files
    num_epochs : int, optional
        The number the epochs to train for (one epoch is when all examples in the dataset are seen by the network)
    model : string, optional
        The path where to save the trained model (theano tensor containing the network)
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor
    Yields
    ------
    losser : list
        The losses for each epoch, stored in a list
    """

    logging.info("Building Autoencoder")
    input_var2 = T.tensor4('inputs')
    target_var2 = T.tensor4('targets')
    rand_num = T.tensor4('rand_num')

    eps = 1e-18
    alpha = 0.001  # defined but unused in this variant

    network2 = fun(input_var=input_var2,
                   batch_size=train.batch_size,
                   time_context=train.time_context,
                   feat_size=train.input_size)

    if load:
        params = load_model(model)
        lasagne.layers.set_all_param_values(network2, params)

    prediction2 = lasagne.layers.get_output(network2, deterministic=True)

    rand_num = np.random.uniform(size=(train.batch_size, 1, train.time_context,
                                       train.input_size))

    s1 = prediction2[:, 0:1, :, :]
    s2 = prediction2[:, 1:2, :, :]
    s3 = prediction2[:, 2:3, :, :]
    s4 = prediction2[:, 3:4, :, :]

    mask1 = s1 / (s1 + s2 + s3 + s4 + eps * rand_num)
    mask2 = s2 / (s1 + s2 + s3 + s4 + eps * rand_num)
    mask3 = s3 / (s1 + s2 + s3 + s4 + eps * rand_num)
    mask4 = s4 / (s1 + s2 + s3 + s4 + eps * rand_num)
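    # Each mask is one source estimate divided by the sum of all four, so the
    # masks are nonnegative and sum to roughly 1 at every time-frequency bin;
    # the eps * rand_num term guards against division by zero.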

    source1 = mask1 * input_var2[:, 0:1, :, :]
    source2 = mask2 * input_var2[:, 0:1, :, :]
    source3 = mask3 * input_var2[:, 0:1, :, :]
    source4 = mask4 * input_var2[:, 0:1, :, :]

    train_loss_recon1 = lasagne.objectives.squared_error(
        source1, target_var2[:, 0:1, :, :])
    train_loss_recon2 = lasagne.objectives.squared_error(
        source2, target_var2[:, 1:2, :, :])
    train_loss_recon3 = lasagne.objectives.squared_error(
        source3, target_var2[:, 2:3, :, :])
    train_loss_recon4 = lasagne.objectives.squared_error(
        source4, target_var2[:, 3:4, :, :])

    error1 = train_loss_recon1.sum()
    error2 = train_loss_recon2.sum()
    error3 = train_loss_recon3.sum()
    error4 = train_loss_recon4.sum()

    loss = abs(error1 + error2 + error3 + error4)

    params1 = lasagne.layers.get_all_params(network2, trainable=True)

    updates = lasagne.updates.adadelta(loss, params1)

    train_fn = theano.function([input_var2, target_var2],
                               loss,
                               updates=updates,
                               allow_input_downcast=True)

    train_fn1 = theano.function([input_var2, target_var2],
                                [error1, error2, error3, error4],
                                allow_input_downcast=True)

    predict_function2 = theano.function([input_var2],
                                        [source1, source2, source3, source4],
                                        allow_input_downcast=True)

    losser = []

    if not skip_train:

        logging.info("Training...")
        for epoch in range(num_epochs):

            train_err = 0
            train_batches = 0
            err1 = 0
            err2 = 0
            err3 = 0
            err4 = 0
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()

                jump = inputs.shape[2]
                targets = np.ndarray(shape=(inputs.shape[0], 4,
                                            inputs.shape[1], inputs.shape[2]))
                inputs = np.reshape(
                    inputs,
                    (inputs.shape[0], 1, inputs.shape[1], inputs.shape[2]))

                targets[:, 0, :, :] = target[:, :, :jump]
                targets[:, 1, :, :] = target[:, :, jump:jump * 2]
                targets[:, 2, :, :] = target[:, :, jump * 2:jump * 3]
                targets[:, 3, :, :] = target[:, :, jump * 3:jump * 4]
                target = None
                #gc.collect()

                train_err += train_fn(inputs, targets)
                [e1, e2, e3, e4] = train_fn1(inputs, targets)
                err1 += e1
                err2 += e2
                err3 += e3
                err4 += e4
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs,
                time.time() - start_time))
            logging.info("  training loss:\t\t{:.6f}".format(train_err /
                                                             train_batches))
            logging.info("  training loss for bassoon:\t\t{:.6f}".format(
                err1 / train_batches))
            logging.info("  training loss for clarinet:\t\t{:.6f}".format(
                err2 / train_batches))
            logging.info("  training loss for saxophone:\t\t{:.6f}".format(
                err3 / train_batches))
            logging.info("  training loss for violin:\t\t{:.6f}".format(
                err4 / train_batches))
            losser.append(train_err / train_batches)
            save_model(model, network2)

    if not skip_sep:

        logging.info("Separating")
        sources = ['bassoon', 'clarinet', 'saxphone', 'violin']  # 'saxphone' (sic) matches the audio file names
        sources_midi = ['bassoon', 'clarinet', 'saxophone', 'violin']

        for f in testfile_list:
            for i in range(len(sources)):
                filename = os.path.join(testdir, f,
                                        f + '-' + sources[i] + '.wav')
                audioObj, sampleRate, bitrate = util.readAudioScipy(filename)

                assert sampleRate == 44100, "Sample rate needs to be 44100"

                # tt is assumed to be a module-level transformFFT instance
                nframes = int(np.ceil(
                    len(audioObj) / np.double(tt.hopSize))) + 2
                if i == 0:
                    audio = np.zeros(audioObj.shape[0])
                    #melody = np.zeros((len(sources),1,nframes))
                audio = audio + audioObj
                audioObj = None

            mag, ph = transform.compute_file(audio, phase=True)
            mag = scale_factor * mag.astype(np.float32)

            batches, nchunks = util.generate_overlapadd(
                mag,
                input_size=mag.shape[-1],
                time_context=train.time_context,
                overlap=train.overlap,
                batch_size=train.batch_size,
                sampleRate=44100)
            output = []
            #output1=[]

            batch_no = 1
            for batch in batches:
                batch_no += 1
                start_time = time.time()
                output.append(predict_function2(batch))

            output = np.array(output)
            mm = util.overlapadd_multi(output,
                                       batches,
                                       nchunks,
                                       overlap=train.overlap)
            for i in range(len(sources)):
                audio_out = transform.compute_inverse(
                    mm[i, :len(ph)] / scale_factor, ph)
                if len(audio_out) > len(audio):
                    audio_out = audio_out[:len(audio)]
                util.writeAudioScipy(
                    os.path.join(outdir, f + '-' + sources[i] + '.wav'),
                    audio_out, sampleRate, bitrate)
                audio_out = None

        style = ['fast', 'slow', 'original']
        if not os.path.exists(outdir1):
            os.makedirs(outdir1)
        for s in style:
            for f in testfile_list1:
                for i in range(len(sources)):
                    filename = os.path.join(
                        testdir1, f,
                        f + '_' + s + '_' + sources_midi[i] + '.wav')
                    audioObj, sampleRate, bitrate = util.readAudioScipy(
                        filename)

                    assert sampleRate == 44100, "Sample rate needs to be 44100"

                    nframes = int(
                        np.ceil(len(audioObj) / np.double(tt.hopSize))) + 2

                    if i == 0:
                        audio = np.zeros(audioObj.shape[0])
                        #melody = np.zeros((len(sources),1,nframes))
                    audio = audio + audioObj
                    audioObj = None

                mag, ph = transform.compute_file(audio, phase=True)
                mag = scale_factor * mag.astype(np.float32)

                batches, nchunks = util.generate_overlapadd(
                    mag,
                    input_size=mag.shape[-1],
                    time_context=train.time_context,
                    overlap=train.overlap,
                    batch_size=train.batch_size,
                    sampleRate=44100)
                output = []

                batch_no = 1
                for batch in batches:
                    batch_no += 1
                    start_time = time.time()
                    output.append(predict_function2(batch))

                output = np.array(output)
                mm = util.overlapadd_multi(output,
                                           batches,
                                           nchunks,
                                           overlap=train.overlap)
                for i in range(len(sources)):
                    audio_out = transform.compute_inverse(
                        mm[i, :len(ph)] / scale_factor, ph)
                    if len(audio_out) > len(audio):
                        audio_out = audio_out[:len(audio)]
                    filename = os.path.join(
                        outdir1, f + '_' + s + '_' + sources_midi[i] + '.wav')
                    util.writeAudioScipy(filename, audio_out, sampleRate,
                                         bitrate)
                    audio_out = None

    return losser
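
A minimal usage sketch (illustrative only: the dataset, transform, and network
builder come from the surrounding project, and all paths and the build_net name
are placeholders):

    tt = transformFFT(frameSize=1024, hopSize=512, sampleRate=44100)
    train = LargeDataset(path_transform_in='features/', batch_size=32,
                         time_context=30, overlap=25)
    losser = train_auto(train, build_net, tt,
                        'data/test', 'out', ['piece1'],
                        'data/test_styles', 'out_styles', ['piece2'],
                        num_epochs=30, model='model.pkl')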
Example #2
def train_auto(train,
               fun,
               transform,
               testdir,
               outdir,
               num_epochs=30,
               model="1.pkl",
               scale_factor=0.3,
               load=False,
               skip_train=False,
               skip_sep=False):
    """
    Trains a network built with \"fun\" with the data generated with \"train\"
    and then separates the files in \"testdir\",writing the result in \"outdir\"

    Parameters
    ----------
    train : Callable, e.g. LargeDataset object
        The callable which generates training data for the network: inputs, target = train()
    fun : lasagne network object, Theano tensor
        The network to be trained  
    transform : transformFFT object
        The Transform object which was used to compute the features (see compute_features.py)
    testdir : string, optional
        The directory where the files to be separated are located
    outdir : string, optional
        The directory where to write the separated files
    num_epochs : int, optional
        The number the epochs to train for (one epoch is when all examples in the dataset are seen by the network)
    model : string, optional
        The path where to save the trained model (theano tensor containing the network) 
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor
    Yields
    ------
    losser : list
        The losses for each epoch, stored in a list
    """

    logging.info("Building Autoencoder")
    input_var2 = T.tensor4('inputs')
    target_var2 = T.tensor4('targets')
    rand_num = T.tensor4('rand_num')

    eps = 1e-8
    alpha = 0.001
    beta = 0.01
    beta_voc = 0.03

    network2 = fun(input_var=input_var2,
                   batch_size=train.batch_size,
                   time_context=train.time_context,
                   feat_size=train.input_size)

    if load:
        params = load_model(model)
        lasagne.layers.set_all_param_values(network2, params)

    prediction2 = lasagne.layers.get_output(network2, deterministic=True)

    rand_num = np.random.uniform(size=(train.batch_size, 1, train.time_context,
                                       train.input_size))

    voc = prediction2[:, 0:1, :, :] + eps * rand_num
    bas = prediction2[:, 1:2, :, :] + eps * rand_num
    dru = prediction2[:, 2:3, :, :] + eps * rand_num
    oth = prediction2[:, 3:4, :, :] + eps * rand_num

    mask1 = voc / (voc + bas + dru + oth)
    mask2 = bas / (voc + bas + dru + oth)
    mask3 = dru / (voc + bas + dru + oth)
    mask4 = oth / (voc + bas + dru + oth)
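    # Here eps * rand_num is added to each source estimate before the ratio
    # (unlike Example #1), so the four masks sum to exactly 1 per bin and the
    # denominator never vanishes.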

    vocals = mask1 * input_var2
    bass = mask2 * input_var2
    drums = mask3 * input_var2
    others = mask4 * input_var2

    train_loss_recon_vocals = lasagne.objectives.squared_error(
        vocals, target_var2[:, 0:1, :, :])
    alpha_component = alpha * lasagne.objectives.squared_error(
        vocals, target_var2[:, 1:2, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        vocals, target_var2[:, 2:3, :, :])
    train_loss_recon_neg_voc = beta_voc * lasagne.objectives.squared_error(
        vocals, target_var2[:, 3:4, :, :])

    train_loss_recon_bass = lasagne.objectives.squared_error(
        bass, target_var2[:, 1:2, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        bass, target_var2[:, 0:1, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        bass, target_var2[:, 2:3, :, :])
    train_loss_recon_neg = beta * lasagne.objectives.squared_error(
        bass, target_var2[:, 3:4, :, :])

    train_loss_recon_drums = lasagne.objectives.squared_error(
        drums, target_var2[:, 2:3, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        drums, target_var2[:, 0:1, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        drums, target_var2[:, 1:2, :, :])
    train_loss_recon_neg += beta * lasagne.objectives.squared_error(
        drums, target_var2[:, 3:4, :, :])

    vocals_error = train_loss_recon_vocals.sum()
    drums_error = train_loss_recon_drums.sum()
    bass_error = train_loss_recon_bass.sum()
    negative_error = train_loss_recon_neg.sum()
    negative_error_voc = train_loss_recon_neg_voc.sum()
    alpha_component = alpha_component.sum()

    loss = abs(vocals_error + drums_error + bass_error - negative_error -
               alpha_component - negative_error_voc)
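    # Minimizing this loss drives each estimate toward its own target, while the
    # subtracted alpha- and beta-weighted terms push the estimates away from the
    # other targets (the beta terms measure similarity to the 'other' stem).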

    params1 = lasagne.layers.get_all_params(network2, trainable=True)

    updates = lasagne.updates.adadelta(loss, params1)

    # val_updates=lasagne.updates.nesterov_momentum(loss1, params1, learning_rate=0.00001, momentum=0.7)

    train_fn = theano.function([input_var2, target_var2],
                               loss,
                               updates=updates,
                               allow_input_downcast=True)

    train_fn1 = theano.function([input_var2, target_var2], [
        vocals_error, bass_error, drums_error, negative_error, alpha_component,
        negative_error_voc
    ],
                                allow_input_downcast=True)

    predict_function2 = theano.function([input_var2],
                                        [vocals, bass, drums, others],
                                        allow_input_downcast=True)

    losser = []
    loss2 = []

    if not skip_train:

        logging.info("Training...")
        for epoch in range(num_epochs):

            train_err = 0
            train_batches = 0
            vocals_err = 0
            drums_err = 0
            bass_err = 0
            negative_err = 0
            # alpha_component and beta_voc are reused here as per-epoch
            # accumulators; the symbolic loss was already compiled above
            alpha_component = 0
            beta_voc = 0
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()
                jump = inputs.shape[2]
                inputs = np.reshape(
                    inputs,
                    (inputs.shape[0], 1, inputs.shape[1], inputs.shape[2]))
                targets = np.ndarray(shape=(inputs.shape[0], 4,
                                            inputs.shape[2], inputs.shape[3]))
                #import pdb;pdb.set_trace()
                targets[:, 0, :, :] = target[:, :, :jump]
                targets[:, 1, :, :] = target[:, :, jump:jump * 2]
                targets[:, 2, :, :] = target[:, :, jump * 2:jump * 3]
                targets[:, 3, :, :] = target[:, :, jump * 3:jump * 4]
                target = None

                train_err += train_fn(inputs, targets)
                [
                    vocals_erre, bass_erre, drums_erre, negative_erre, alpha,
                    betae_voc
                ] = train_fn1(inputs, targets)
                vocals_err += vocals_erre
                bass_err += bass_erre
                drums_err += drums_erre
                negative_err += negative_erre
                beta_voc += betae_voc
                alpha_component += alpha
                train_batches += 1

            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs,
                time.time() - start_time))
            print("  training loss:\t\t{:.6f}".format(train_err /
                                                      train_batches))
            print("  training loss for vocals:\t\t{:.6f}".format(
                vocals_err / train_batches))
            print("  training loss for bass:\t\t{:.6f}".format(bass_err /
                                                               train_batches))
            print("  training loss for drums:\t\t{:.6f}".format(drums_err /
                                                                train_batches))
            print("  Beta component:\t\t{:.6f}".format(negative_err /
                                                       train_batches))
            print("  Beta component for voice:\t\t{:.6f}".format(
                beta_voc / train_batches))
            print("  alpha component:\t\t{:.6f}".format(alpha_component /
                                                        train_batches))
            losser.append(train_err / train_batches)
            save_model(model, network2)

    if not skip_sep:

        logging.info("Separating")
        source = ['vocals', 'bass', 'drums', 'other']
        dev_directory = os.listdir(os.path.join(testdir, "Dev"))
        test_directory = os.listdir(os.path.join(testdir, "Test"))
        dirlist = []
        dirlist.extend(dev_directory)
        dirlist.extend(test_directory)
        for f in sorted(dirlist):
            if not f.startswith('.'):
                if f in dev_directory:
                    song = os.path.join(testdir, "Dev", f, "mixture.wav")
                else:
                    song = os.path.join(testdir, "Test", f, "mixture.wav")
                audioObj, sampleRate, bitrate = util.readAudioScipy(song)

                assert sampleRate == 44100, "Sample rate needs to be 44100"

                audio = (audioObj[:, 0] + audioObj[:, 1]) / 2
                audioObj = None
                mag, ph = transform.compute_file(audio, phase=True)

                mag = scale_factor * mag.astype(np.float32)

                batches, nchunks = util.generate_overlapadd(
                    mag,
                    input_size=mag.shape[-1],
                    time_context=train.time_context,
                    overlap=train.overlap,
                    batch_size=train.batch_size,
                    sampleRate=sampleRate)
                output = []

                batch_no = 1
                for batch in batches:
                    batch_no += 1
                    start_time = time.time()
                    output.append(predict_function2(batch))

                output = np.array(output)
                mm = util.overlapadd_multi(output,
                                           batches,
                                           nchunks,
                                           overlap=train.overlap)

                #write audio files
                if f in dev_directory:
                    dirout = os.path.join(outdir, "Dev", f)
                else:
                    dirout = os.path.join(outdir, "Test", f)
                if not os.path.exists(dirout):
                    os.makedirs(dirout)
                for i in range(mm.shape[0]):
                    audio_out = transform.compute_inverse(
                        mm[i, :len(ph)] / scale_factor, ph)
                    if len(audio_out) > len(audio):
                        audio_out = audio_out[:len(audio)]
                    util.writeAudioScipy(
                        os.path.join(dirout, source[i] + '.wav'), audio_out,
                        sampleRate, bitrate)
                    audio_out = None
                audio = None

    return losser
Example #3
def train_auto(train,
               fun,
               transform,
               testdir,
               outdir,
               num_epochs=30,
               model="1.pkl",
               scale_factor=0.3,
               load=False,
               skip_train=False,
               skip_sep=False):
    """
    Trains a network built with \"fun\" with the data generated with \"train\"
    and then separates the files in \"testdir\",writing the result in \"outdir\"

    Parameters
    ----------
    train : Callable, e.g. LargeDataset object
        The callable which generates training data for the network: inputs, target = train()
    fun : lasagne network object, Theano tensor
        The network to be trained  
    transform : transformFFT object
        The Transform object which was used to compute the features (see compute_features.py)
    testdir : string, optional
        The directory where the files to be separated are located
    outdir : string, optional
        The directory where to write the separated files
    num_epochs : int, optional
        The number the epochs to train for (one epoch is when all examples in the dataset are seen by the network)
    model : string, optional
        The path where to save the trained model (theano tensor containing the network) 
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor
    Yields
    ------
    losser : list
        The losses for each epoch, stored in a list
    """

    logging.info("Building Autoencoder")
    input_var2 = T.tensor4('inputs')
    target_var2 = T.tensor4('targets')
    rand_num = T.tensor4('rand_num')
    
    eps=1e-8
    alpha=0.9
    beta_acc=0.005
    beta_voc=0.02

    network2 = fun(input_var=input_var2,batch_size=train.batch_size,time_context=train.time_context,feat_size=train.input_size)
    
    if load:
        params=load_model(model)
        lasagne.layers.set_all_param_values(network2,params)

    prediction2 = lasagne.layers.get_output(network2, deterministic=True)

    rand_num = np.random.uniform(size=(train.batch_size,1,train.time_context,train.input_size))

    voc=prediction2[:,0:1,:,:]+eps*rand_num
    acco=prediction2[:,1:2,:,:]+eps*rand_num

    mask1=voc/(voc+acco)
    mask2=acco/(voc+acco)

    vocals=mask1*input_var2[:,0:1,:,:]
    acc=mask2*input_var2[:,0:1,:,:]
    
    train_loss_recon_vocals = lasagne.objectives.squared_error(vocals,target_var2[:,0:1,:,:])
    train_loss_recon_acc = alpha * lasagne.objectives.squared_error(acc,target_var2[:,1:2,:,:])    
    train_loss_recon_neg_voc = beta_voc * lasagne.objectives.squared_error(vocals,target_var2[:,1:2,:,:])
    train_loss_recon_neg_acc = beta_acc * lasagne.objectives.squared_error(acc,target_var2[:,0:1,:,:])

    vocals_error=train_loss_recon_vocals.sum()  
    acc_error=train_loss_recon_acc.sum()  
    negative_error_voc=train_loss_recon_neg_voc.sum()
    negative_error_acc=train_loss_recon_neg_acc.sum()
    
    loss=abs(vocals_error+acc_error-negative_error_voc)
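    # negative_error_acc is computed and reported during training but does not
    # enter the loss; only the vocals dissimilarity term is subtracted here.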

    params1 = lasagne.layers.get_all_params(network2, trainable=True)

    updates = lasagne.updates.adadelta(loss, params1)

    train_fn = theano.function([input_var2,target_var2], loss, updates=updates,allow_input_downcast=True)

    train_fn1 = theano.function([input_var2,target_var2], [vocals_error,acc_error,negative_error_voc,negative_error_acc], allow_input_downcast=True)

    predict_function2=theano.function([input_var2],[vocals,acc],allow_input_downcast=True)
    predict_function3=theano.function([input_var2],[prediction2[:,0:1,:,:],prediction2[:,1:2,:,:]],allow_input_downcast=True)
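    # predict_function2 returns the masked source estimates used for separation;
    # predict_function3 exposes the raw network outputs before masking (not used
    # below).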

    losser=[]
    loss2=[]

    if not skip_train:

        logging.info("Training...")
        for epoch in range(num_epochs):

            train_err = 0
            train_batches = 0
            vocals_err=0
            acc_err=0
            # beta_voc and beta_acc are reused below as per-epoch accumulators;
            # the symbolic loss was already compiled above
            beta_voc=0
            beta_acc=0
            start_time = time.time()
            for batch in range(train.iteration_size): 
                inputs, target = train()
                
                jump = inputs.shape[2]
                targets=np.ndarray(shape=(inputs.shape[0],2,inputs.shape[1],inputs.shape[2]))
                inputs=np.reshape(inputs,(inputs.shape[0],1,inputs.shape[1],inputs.shape[2]))          

                targets[:,0,:,:]=target[:,:,:jump]
                targets[:,1,:,:]=target[:,:,jump:jump*2]         
                target=None
        
                train_err+=train_fn(inputs,targets)
                [vocals_erre,acc_erre,betae_voc,betae_acc]=train_fn1(inputs,targets)
                vocals_err += vocals_erre
                acc_err += acc_erre           
                beta_voc+= betae_voc
                beta_acc+= betae_acc
                train_batches += 1
            
            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            logging.info("  training loss:\t\t{:.6f}".format(train_err/train_batches))
            logging.info("  training loss for vocals:\t\t{:.6f}".format(vocals_err/train_batches))
            logging.info("  training loss for acc:\t\t{:.6f}".format(acc_err/train_batches))
            logging.info("  Beta component for voice:\t\t{:.6f}".format(beta_voc/train_batches))
            logging.info("  Beta component for acc:\t\t{:.6f}".format(beta_acc/train_batches))
            losser.append(train_err / train_batches)
            save_model(model,network2)

    if not skip_sep:

        logging.info("Separating")
        for f in os.listdir(testdir):
            if f.endswith(".wav"):
                audioObj, sampleRate, bitrate = util.readAudioScipy(os.path.join(testdir,f))
                
                assert sampleRate == 44100,"Sample rate needs to be 44100"

                audio = audioObj[:,0] + audioObj[:,1]
                audioObj = None
                mag,ph=transform.compute_file(audio,phase=True)
         
                mag=scale_factor*mag.astype(np.float32)

                batches,nchunks = util.generate_overlapadd(mag,input_size=mag.shape[-1],time_context=train.time_context,overlap=train.overlap,batch_size=train.batch_size,sampleRate=sampleRate)
                output=[]

                batch_no=1
                for batch in batches:
                    batch_no+=1
                    start_time=time.time()
                    output.append(predict_function2(batch))

                output=np.array(output)
                bmag,mm=util.overlapadd(output,batches,nchunks,overlap=train.overlap)
                
                audio_out=transform.compute_inverse(bmag[:len(ph)]/scale_factor,ph)
                if len(audio_out)>len(audio):
                    audio_out=audio_out[:len(audio)]
                audio_out=essentia.array(audio_out)
                audio_out2= transform.compute_inverse(mm[:len(ph)]/scale_factor,ph) 
                if len(audio_out2)>len(audio):
                    audio_out2=audio_out2[:len(audio)]  
                audio_out2=essentia.array(audio_out2) 
                #write audio files
                util.writeAudioScipy(os.path.join(outdir,f.replace(".wav","-voice.wav")),audio_out,sampleRate,bitrate)
                util.writeAudioScipy(os.path.join(outdir,f.replace(".wav","-music.wav")),audio_out2,sampleRate,bitrate)
                audio_out=None 
                audio_out2=None   

    return losser  
Example #4
def train_auto(fun,
               transform,
               testdir,
               outdir,
               testfile_list,
               testdir1,
               outdir1,
               testfile_list1,
               num_epochs=30,
               model="1.pkl",
               scale_factor=0.3,
               load=False,
               skip_train=False,
               skip_sep=False,
               path_transform_in=None,
               nsamples=40,
               batch_size=32,
               batch_memory=50,
               time_context=30,
               overlap=25,
               nprocs=4,
               mult_factor_in=0.3,
               mult_factor_out=0.3,
               timbre_model_path=None):
    """
    Trains a network built with \"fun\" with the data generated with \"train\"
    and then separates the files in \"testdir\",writing the result in \"outdir\"

    Parameters
    ----------
    fun : lasagne network object, Theano tensor
        The network to be trained
    transform : transformFFT object
        The Transform object which was used to compute the features (see compute_features.py)
    testdir : string, optional
        The directory where the files to be separated are located
    outdir : string, optional
        The directory where to write the separated files
    num_epochs : int, optional
        The number the epochs to train for (one epoch is when all examples in the dataset are seen by the network)
    model : string, optional
        The path where to save the trained model (theano tensor containing the network)
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor
    Yields
    ------
    losser : list
        The losses for each epoch, stored in a list
    """

    logging.info("Building Autoencoder")
    input_var2 = T.tensor4('inputs')
    target_var2 = T.tensor4('targets')
    rand_num = T.tensor4('rand_num')

    #parameters for the score-informed separation
    nharmonics=20
    interval=50 #cents
    tuning_freq=440 #Hz

    eps=1e-18
    alpha=0.001

    input_size = int(float(transform.frameSize) / 2 + 1)

    network2 = fun(input_var=input_var2,batch_size=batch_size,time_context=time_context,feat_size=input_size,nchannels=4)

    if load:
        params=load_model(model)
        lasagne.layers.set_all_param_values(network2,params)

    prediction2 = lasagne.layers.get_output(network2, deterministic=True)

    rand_num = np.random.uniform(size=(batch_size,1,time_context,input_size))

    s1=prediction2[:,0:1,:,:]
    s2=prediction2[:,1:2,:,:]
    s3=prediction2[:,2:3,:,:]
    s4=prediction2[:,3:4,:,:]

    mask1=s1/(s1+s2+s3+s4+eps*rand_num)
    mask2=s2/(s1+s2+s3+s4+eps*rand_num)
    mask3=s3/(s1+s2+s3+s4+eps*rand_num)
    mask4=s4/(s1+s2+s3+s4+eps*rand_num)

    input_var = input_var2[:,0:1,:,:] + input_var2[:,1:2,:,:] + input_var2[:,2:3,:,:] + input_var2[:,3:4,:,:]

    source1=mask1*input_var[:,0:1,:,:]
    source2=mask2*input_var[:,0:1,:,:]
    source3=mask3*input_var[:,0:1,:,:]
    source4=mask4*input_var[:,0:1,:,:]

    train_loss_recon1 = lasagne.objectives.squared_error(source1,target_var2[:,0:1,:,:])
    train_loss_recon2 = lasagne.objectives.squared_error(source2,target_var2[:,1:2,:,:])
    train_loss_recon3 = lasagne.objectives.squared_error(source3,target_var2[:,2:3,:,:])
    train_loss_recon4 = lasagne.objectives.squared_error(source4,target_var2[:,3:4,:,:])

    error1=train_loss_recon1.sum()
    error2=train_loss_recon2.sum()
    error3=train_loss_recon3.sum()
    error4=train_loss_recon4.sum()

    loss=abs(error1+error2+error3+error4)

    params1 = lasagne.layers.get_all_params(network2, trainable=True)

    updates = lasagne.updates.adadelta(loss, params1)

    train_fn = theano.function([input_var2,target_var2], loss, updates=updates,allow_input_downcast=True)

    train_fn1 = theano.function([input_var2,target_var2], [error1,error2,error3,error4], allow_input_downcast=True)

    predict_function2=theano.function([input_var2],[source1,source2,source3,source4],allow_input_downcast=True)

    losser=[]
    min_loss = 1e14

    training_steps = 0

    if not skip_train:

        logging.info("Training...")
        for epoch in range(num_epochs):
            train = LargeDatasetMask2(path_transform_in=path_transform_in, nsources=4, nsamples=nsamples, batch_size=batch_size, batch_memory=batch_memory, time_context=time_context, overlap=overlap, nprocs=nprocs,mult_factor_in=scale_factor,mult_factor_out=scale_factor,\
                sampleRate=transform.sampleRate,pitch_code='e', nharmonics=nharmonics, pitch_norm=127.,tensortype=theano.config.floatX,timbre_model_path=timbre_model_path)
            train_err = 0
            train_batches = 0
            err1=0
            err2=0
            err3=0
            err4=0
            start_time = time.time()
            for batch in range(train.iteration_size):

                inputs, target, masks = train()
                jump = inputs.shape[2]

                mask=np.empty(shape=(inputs.shape[0],4,inputs.shape[1],inputs.shape[2]),dtype=theano.config.floatX)
                mask[:,0,:,:]=masks[:,:,:jump] * inputs
                mask[:,1,:,:]=masks[:,:,jump:jump*2] * inputs
                mask[:,2,:,:]=masks[:,:,jump*2:jump*3] * inputs
                mask[:,3,:,:]=masks[:,:,jump*3:jump*4] * inputs
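                # mask packs the four score-informed masks, applied to the
                # mixture spectrogram, as the network's 4 input channels;
                # targets below holds the corresponding isolated-source
                # spectrograms.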
                masks=None

                targets=np.empty(shape=(inputs.shape[0],4,inputs.shape[1],inputs.shape[2]),dtype=theano.config.floatX)
                targets[:,0,:,:]=target[:,:,:jump]
                targets[:,1,:,:]=target[:,:,jump:jump*2]
                targets[:,2,:,:]=target[:,:,jump*2:jump*3]
                targets[:,3,:,:]=target[:,:,jump*3:jump*4]
                target=None

                inputs=None

                train_err+=train_fn(mask,targets)
                [e1,e2,e3,e4]=train_fn1(mask,targets)
                err1 += e1
                err2 += e2
                err3 += e3
                err4 += e4
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            logging.info("  training loss:\t\t{:.6f}".format(train_err/train_batches))
            logging.info("  training loss for bassoon:\t\t{:.6f}".format(err1/train_batches))
            logging.info("  training loss for clarinet:\t\t{:.6f}".format(err2/train_batches))
            logging.info("  training loss for saxophone:\t\t{:.6f}".format(err3/train_batches))
            logging.info("  training loss for violin:\t\t{:.6f}".format(err4/train_batches))
            losser.append(train_err / train_batches)
            #save_model(model,network2)
            # if (train_err/train_batches) < min_loss:
            #     min_loss = train_err/train_batches
            save_model(model,network2)

        # training_steps = training_steps + 1
        # num_epochs = int(np.ceil(float(num_epochs)/5.))

        # if losser[-1] > min_loss:
        #     params=load_model(model)
        #     lasagne.layers.set_all_param_values(network2,params,learning_rate=0.0001)

        # updates = lasagne.updates.adam(loss, params1)
        # train_fn = theano.function([input_var2,target_var2], loss, updates=updates,allow_input_downcast=True)


    if not skip_sep:

        logging.info("Separating")
        sources = ['bassoon','clarinet','saxphone','violin']
        sources_midi = ['bassoon','clarinet','saxophone','violin']

        train = LargeDatasetMask2(path_transform_in=path_transform_in, nsources=4, batch_size=batch_size, batch_memory=batch_memory, time_context=time_context, overlap=overlap, nprocs=nprocs,mult_factor_in=scale_factor,mult_factor_out=scale_factor,\
                sampleRate=transform.sampleRate,pitch_code='e', nharmonics=nharmonics, pitch_norm=127.,tensortype=theano.config.floatX,timbre_model_path=timbre_model_path)

        for f in testfile_list:
            nelem_g=1
            for i in range(len(sources)):
                ng = util.getMidiNum(sources_midi[i]+'_b',os.path.join(testdir,f),0,40.0)
                nelem_g = np.maximum(ng,nelem_g)
            melody = np.zeros((len(sources),int(nelem_g),2*nharmonics+3))
            for i in range(len(sources)):
                filename=os.path.join(testdir,f,f+'-'+sources[i]+'.wav')
                audioObj, sampleRate, bitrate = util.readAudioScipy(filename)

                assert sampleRate == 44100,"Sample rate needs to be 44100"

                # tt is assumed to be a module-level transformFFT instance
                nframes = int(np.ceil(len(audioObj) / np.double(tt.hopSize))) + 2
                if i==0:
                    audio = np.zeros(audioObj.shape[0])
                audio = audio + audioObj
                audioObj=None

                tmp = util.expandMidi(sources_midi[i]+'_b',os.path.join(testdir,f),0,40.0,interval,tuning_freq,nharmonics,sampleRate,tt.hopSize,tt.frameSize,0.2,0.2,nframes,0.5)
                melody[i,:tmp.shape[0],:] = tmp
                tmp = None

            mag,ph=transform.compute_file(audio,phase=True)
            mag=scale_factor*mag.astype(np.float32)

            jump = mag.shape[-1]

            masks_temp = train.filterSpec(mag,melody,0,nframes)
            masks = np.ones((train.ninst,mag.shape[0],mag.shape[1]))
            masks[0,:,:]=masks_temp[:,:jump] * mag
            masks[1,:,:]=masks_temp[:,jump:jump*2] * mag
            masks[2,:,:]=masks_temp[:,jump*2:jump*3] * mag
            masks[3,:,:]=masks_temp[:,jump*3:jump*4] * mag
            mag = None
            masks_temp = None

            batches,nchunks = util.generate_overlapadd(masks,input_size=masks.shape[-1],time_context=train.time_context,overlap=train.overlap,batch_size=train.batch_size,sampleRate=44100)
            masks = None

            batch_no=1
            output=[]
            for batch in batches:
                batch_no+=1
                #start_time=time.time()
                output.append(predict_function2(batch))

            output=np.array(output)
            mm=util.overlapadd_multi(output,batches,nchunks,overlap=train.overlap)
            for i in range(len(sources)):
                audio_out=transform.compute_inverse(mm[i,:len(ph)]/scale_factor,ph)
                if len(audio_out)>len(audio):
                    audio_out=audio_out[:len(audio)]
                util.writeAudioScipy(os.path.join(outdir,f+'-'+sources[i]+'.wav'),audio_out,sampleRate,bitrate)
                audio_out=None

        # style = ['fast','slow','original']
        # style_midi = ['_fast20','_slow20','_original']
        # if not os.path.exists(outdir1):
        #     os.makedirs(outdir1)
        # for s in range(len(style)):
        #     for f in testfile_list1:
        #         nelem_g=1
        #         for i in range(len(sources)):
        #             ng = util.getMidiNum(sources_midi[i]+'_g'+style_midi[s],os.path.join(testdir1,f),0,40.0)
        #             nelem_g = np.maximum(ng,nelem_g)
        #         melody = np.zeros((len(sources),int(nelem_g),2*nharmonics+3))
        #         for i in range(len(sources)):
        #             filename=os.path.join(testdir1,f,f+'_'+style[s]+'_'+sources_midi[i]+'.wav')

        #             audioObj, sampleRate, bitrate = util.readAudioScipy(filename)

        #             assert sampleRate == 44100,"Sample rate needs to be 44100"

        #             nframes = int(np.ceil(len(audioObj) / np.double(tt.hopSize))) + 2

        #             if i==0:
        #                 audio = np.zeros(audioObj.shape[0])

        #             audio = audio + audioObj
        #             audioObj=None

        #             tmp = util.expandMidi(sources_midi[i]+'_g'+style_midi[s],os.path.join(testdir1,f),0,40.0,interval,tuning_freq,nharmonics,sampleRate,tt.hopSize,tt.frameSize,0.2,0.2,nframes)
        #             melody[i,:tmp.shape[0],:] = tmp
        #             tmp = None

        #         mag,ph=transform.compute_file(audio,phase=True)
        #         mag=scale_factor*mag.astype(np.float32)

        #         jump = mag.shape[-1]

        #         masks_temp = train.filterSpec(mag,melody,0,nframes)
        #         masks = np.ones((train.ninst,mag.shape[0],mag.shape[1]))
        #         masks[0,:,:]=masks_temp[:,:jump] * mag
        #         masks[1,:,:]=masks_temp[:,jump:jump*2] * mag
        #         masks[2,:,:]=masks_temp[:,jump*2:jump*3] * mag
        #         masks[3,:,:]=masks_temp[:,jump*3:jump*4] * mag
        #         mag = None
        #         masks_temp = None

        #         batches,nchunks = util.generate_overlapadd(masks,input_size=masks.shape[-1],time_context=train.time_context,overlap=train.overlap,batch_size=train.batch_size,sampleRate=44100)
        #         masks = None

        #         batch_no=1
        #         output=[]
        #         for batch in batches:
        #             batch_no+=1
        #             #start_time=time.time()
        #             output.append(predict_function2(batch))

        #         output=np.array(output)
        #         mm=util.overlapadd_multi(output,batches,nchunks,overlap=train.overlap)
        #         for i in range(len(sources)):
        #             audio_out=transform.compute_inverse(mm[i,:len(ph)]/scale_factor,ph)
        #             if len(audio_out)>len(audio):
        #                 audio_out=audio_out[:len(audio)]
        #             filename=os.path.join(outdir1,f+'_'+style[s]+'_'+sources_midi[i]+'.wav')
        #             util.writeAudioScipy(filename,audio_out,sampleRate,bitrate)
        #             audio_out=None

    return losser
Example #5
                try:
                    if mix_raw.shape[1] > 1:
                        mix_raw[:, 0] = (mix_raw[:, 0] + mix_raw[:, 1]) / 2
                        mix_raw = mix_raw[:, 0]
                except Exception:
                    # raised for mono input with no second dimension
                    pass

                number_of_blocks = int(
                    len(mix_raw) / (float(sampleRate) * 30.0))
                last_block = int(len(mix_raw) % float(sampleRate))

                # Write the mix file
                mixOutDir = os.path.join(mixture_directory, "Dev", f)
                # Save the variation of the mixture
                util.writeAudioScipy(
                    os.path.join(mixOutDir,
                                 'mixture_' + str(ins + 1) + '.wav'), mix_raw,
                    sampleRate, bitrate)

                if tt is None:
                    #initialize the transform object which will compute the STFT
                    tt = transformFFT(frameSize=1024,
                                      hopSize=512,
                                      sampleRate=sampleRate,
                                      window=blackmanharris)

                assert sampleRate == 44100, "Sample rate needs to be 44100"

                #Take chunks of 30 secs
                for i in range(number_of_blocks):
                    audio = np.zeros((sampleRate * 30, 5))
                    audio[:, 0] = mix_raw[i * 30 * sampleRate:(i + 1) * 30 *
                                          sampleRate]
Example #6
            mix_raw = np.zeros(len(others))
            mix_raw = bass + drums + others + vocals
            try:
                if mix_raw.shape[1] > 1:
                    mix_raw[:, 0] = (mix_raw[:, 0] + mix_raw[:, 1]) / 2
                    mix_raw = mix_raw[:, 0]
            except Exception:
                # raised for mono input with no second dimension
                pass

            number_of_blocks=int(len(mix_raw)/(float(sampleRate)*30.0))
            last_block=int(len(mix_raw)%float(sampleRate))

            # Write the mix file if it does not exist
            mixOut = os.path.join(mixture_directory,"Dev",f,"mixture.wav")
            if not os.path.isfile(mixOut):
                util.writeAudioScipy(mixOut, mix_raw, sampleRate, bitrate)

            if tt is None:
                #initialize the transform object which will compute the STFT
                tt=transformFFT(frameSize=1024, hopSize=512, sampleRate=sampleRate, window=blackmanharris)
 
            assert sampleRate == 44100,"Sample rate needs to be 44100"
    
            #Take chunks of 30 secs
            for i in range(number_of_blocks):
                audio = np.zeros((sampleRate*30,5))
                audio[:,0]=mix_raw[i*30*sampleRate:(i+1)*30*sampleRate] 
                audio[:,1]=vocals[i*sampleRate*30:(i+1)*30*sampleRate]
                audio[:,2]=bass[i*sampleRate*30:(i+1)*sampleRate*30]
                audio[:,3]=drums[i*sampleRate*30:(i+1)*sampleRate*30]
                audio[:,4]=others[i*sampleRate*30:(i+1)*sampleRate*30]
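                # Each 30-second chunk is packed as 5 aligned columns:
                # column 0 is the mixture, columns 1-4 the vocals, bass,
                # drums, and others stems used as separation targets.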
Example #7
def train_auto(fun,
               train,
               transform,
               testdir,
               outdir,
               num_epochs=30,
               model="1.pkl",
               scale_factor=0.3,
               load=False,
               skip_train=False,
               skip_sep=False,
               chunk_size=60,
               chunk_overlap=2,
               nsamples=40,
               batch_size=32,
               batch_memory=50,
               time_context=30,
               overlap=25,
               nprocs=4,
               mult_factor_in=0.3,
               mult_factor_out=0.3):
    """
    Trains a network built with \"fun\" with the data generated with \"train\"
    and then separates the files in \"testdir\",writing the result in \"outdir\"

    Parameters
    ----------
    fun : lasagne network object, Theano tensor
        The network to be trained
    transform : transformFFT object
        The Transform object which was used to compute the features (see compute_features_DSD100.py)
    testdir : string, optional
        The directory where the files to be separated are located
    outdir : string, optional
        The directory where to write the separated files
    num_epochs : int, optional
        The number the epochs to train for (one epoch is when all examples in the dataset are seen by the network)
    model : string, optional
        The path where to save the trained model (theano tensor containing the network)
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor
    Yields
    ------
    losser : list
        The losses for each epoch, stored in a list
    """

    logging.info("Building Autoencoder")
    input_var = T.tensor4('inputs')
    input_mask = T.tensor4('input_mask')
    target_var = T.tensor4('targets')

    theano_rng = RandomStreams(128)

    eps = 1e-12

    sources = ['vocals', 'bass', 'drums', 'other']

    nchannels = int(train.channels_in)
    nsources = int(train.channels_out / train.channels_in)

    print('nchannels: {}'.format(nchannels))
    print('nsources: {}'.format(nsources))

    input_size = int(float(transform.frameSize) / 2 + 1)

    rand_num = theano_rng.normal(size=(batch_size, nsources, time_context,
                                       input_size),
                                 avg=0.0,
                                 std=0.1,
                                 dtype=theano.config.floatX)

    net = fun(input_var=input_var,
              batch_size=batch_size,
              time_context=time_context,
              feat_size=input_size,
              nchannels=nchannels,
              nsources=nsources)
    network = net['l_out']
    if load:
        params = load_model(model)
        lasagne.layers.set_all_param_values(network, params)

    prediction = lasagne.layers.get_output(network, deterministic=True)

    sourceall = []
    errors_insts = []
    loss = 0

    sep_chann = []

    # prediction example for 2 sources in 2 channels:
    # 0, 1 source 0 in channel 0 and 1
    # 2, 3 source 1 in channel 0 and 1
    for j in range(nchannels):
        #print "j: ", j
        masksum = T.sum(prediction[:, j::nchannels, :, :], axis=1)
        temp = T.tile(masksum.dimshuffle(0, 'x', 1, 2), (1, nsources, 1, 1))
        mask = prediction[:, j::nchannels, :, :] / (temp + eps * rand_num)
        source = mask * T.tile(input_var[:, j:j + 1, :, :],
                               (1, nsources, 1, 1)) + eps * rand_num
        sourceall.append(source)

        sep_chann.append(source)
        train_loss_recon = lasagne.objectives.squared_error(
            source, target_var[:, j::nchannels, :, :])

        errors_inst = abs(train_loss_recon.sum(axis=(0, 2, 3)))

        errors_insts.append(errors_inst)

        loss = loss + abs(train_loss_recon.sum())

    params1 = lasagne.layers.get_all_params(network, trainable=True)

    updates = lasagne.updates.adadelta(loss, params1)

    train_fn_mse = theano.function([input_var, target_var],
                                   loss,
                                   updates=updates,
                                   allow_input_downcast=True)

    train_fn1 = theano.function([input_var, target_var],
                                errors_insts,
                                allow_input_downcast=True)

    #----------NEW ILD LOSS CONDITION----------

    rand_num2 = theano_rng.normal(
        size=(batch_size, nsources, time_context, input_size),
        avg=0.0,
        std=0.1,
        dtype=theano.config.floatX)  # nsources in the first dimension?

    #estimate

    interaural_spec_est = sep_chann[0] / (sep_chann[1] + eps * rand_num2)

    alpha_est = 20 * np.log10(abs(interaural_spec_est + eps * rand_num2))
    alpha_est_mean = alpha_est.mean(axis=(0, 1, 2))

    #groundtruth

    interaural_spec_gt = target_var[:, 0::nchannels, :, :] / (
        target_var[:, 1::nchannels, :, :] + eps * rand_num2)

    alpha_gt = 20 * np.log10(abs(interaural_spec_gt + eps * rand_num2))
    alpha_gt_mean = alpha_gt.mean(
        axis=(0, 1, 2))  # this should be a one-dimensional vector

    train_loss_ild = lasagne.objectives.squared_error(alpha_est_mean,
                                                      alpha_gt_mean)

    loss = loss + (abs(train_loss_ild.sum()) / 500)
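    # The ILD term matches interaural level differences between estimate and
    # ground truth: 20*log10 of the channel-0 / channel-1 magnitude ratio,
    # averaged over batch, source, and time so that one value per frequency
    # bin remains, with the squared difference down-weighted by 1/500 above.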

    #------------------------------------------

    predict_function = theano.function([input_var],
                                       sourceall,
                                       allow_input_downcast=True)

    losser = []

    if not skip_train:
        logging.info("Training stage 1 (mse)...")
        for epoch in range(num_epochs):

            train_err = 0
            train_batches = 0
            errs = np.zeros((nchannels, nsources))
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()
                train_err += train_fn_mse(inputs, target)
                errs += np.array(train_fn1(inputs, target))
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs,
                time.time() - start_time))
            logging.info("  training loss:\t\t{:.6f}".format(train_err /
                                                             train_batches))
            for j in range(nchannels):
                for i in range(nsources):
                    logging.info("  training loss for " + sources[i] +
                                 " in mic " + str(j) +
                                 ":\t\t{:.6f}".format(errs[j][i] /
                                                      train_batches))

            model_noILD = model[:-4] + '_noILD' + model[-4:]
            print('model_noILD: {}'.format(model_noILD))
            save_model(model_noILD, network)
            losser.append(train_err / train_batches)


#NEW ILD TRAINING---------------------------------------------------------

        params = load_model(model_noILD)
        lasagne.layers.set_all_param_values(network, params)
        params1 = lasagne.layers.get_all_params(network, trainable=True)
        updates = lasagne.updates.adadelta(loss, params1)
        train_fn_ILD = theano.function([input_var, target_var],
                                       loss,
                                       updates=updates,
                                       allow_input_downcast=True)

        logging.info("Training stage 2 (ILD)...")

        for epoch in range(int(num_epochs / 2)):

            train_err = 0
            train_batches = 0

            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()

                train_err += train_fn_ILD(inputs, target)
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs,
                time.time() - start_time))
            logging.info("  training loss:\t\t{:.6f}".format(train_err /
                                                             train_batches))

            save_model(model, network)
            losser.append(train_err / train_batches)

    if not skip_sep:

        logging.info("Separating")

        subsets = ['Dev', 'Test']
        for sub in subsets:
            # db is assumed to be the dataset root path defined at module level
            for d in sorted(os.listdir(os.path.join(db, 'Mixtures', sub))):
                print(os.path.join(os.path.sep, db, 'Mixtures', sub, d,
                                   'mixture.wav'))
                audio, sampleRate, bitrate = util.readAudioScipy(
                    os.path.join(os.path.sep, db, 'Mixtures', sub, d,
                                 'mixture.wav'))
                nsamples = audio.shape[0]
                sep_audio = np.zeros((nsamples, len(sources), audio.shape[1]))

                mag, ph = transform.compute_transform(audio, phase=True)
                mag = scale_factor * mag.astype(np.float32)
                #print 'mag.shape: ', mag.shape, 'batch_size: ', train.batch_size
                nframes = mag.shape[-2]

                batches_mag, nchunks = util.generate_overlapadd(
                    mag,
                    input_size=mag.shape[-1],
                    time_context=train.time_context,
                    overlap=train.overlap,
                    batch_size=train.batch_size,
                    sampleRate=sampleRate)
                mag = None

                output = []
                for b in range(len(batches_mag)):
                    output.append(predict_function(batches_mag[b]))
                output = np.array(output)

                for j in range(audio.shape[1]):
                    mm = util.overlapadd_multi(np.swapaxes(
                        output[:, j:j + 1, :, :, :, :], 1, 3),
                                               batches_mag,
                                               nchunks,
                                               overlap=train.overlap)
                    for i in range(len(sources)):
                        audio_out = transform.compute_inverse(
                            mm[i, :ph.shape[1], :] / scale_factor, ph[j])
                        # if len(sep_audio[:i,j])<len(audio_out):
                        #     print len(sep_audio), len(audio_out), len(audio_out)-len(sep_audio[:i,j])
                        #     sep_audio = np.concatenate(sep_audio,np.zeros(len(audio_out)-len(sep_audio[:i,j])))
                        #     print len(sep_audio), len(audio_out), len(audio_out)-len(sep_audio[:i,j])
                        sep_audio[:, i, j] = audio_out[:len(sep_audio)]

                print('Saving separation: {}'.format(outdir))
                if not os.path.exists(outdir):
                    os.makedirs(outdir)
                    print('Creating model folder')
                if not os.path.exists(os.path.join(outdir, 'Sources')):
                    os.makedirs(os.path.join(outdir, 'Sources'))
                    print('Creating Sources folder: {}'.format(
                        os.path.join(outdir, 'Sources')))
                if not os.path.exists(os.path.join(outdir, 'Sources', sub)):
                    os.makedirs(os.path.join(outdir, 'Sources', sub))
                    print('Creating subset folder')
                if not os.path.exists(os.path.join(outdir, 'Sources', sub, d)):
                    os.makedirs(os.path.join(outdir, 'Sources', sub, d))
                    print('Creating song folder: {}'.format(
                        os.path.join(outdir, 'Sources', sub, d)))
                for i in range(len(sources)):
                    print('Final audio file {}: {} (nsamples: {}, len sep_audio: {})'.format(
                        i,
                        os.path.join(outdir, 'Sources', sub, d,
                                     sources[i] + '.wav'),
                        nsamples, len(sep_audio)))
                    util.writeAudioScipy(
                        os.path.join(outdir, 'Sources', sub, d,
                                     sources[i] + '.wav'),
                        sep_audio[:nsamples, i, :], sampleRate, bitrate)

    return losser