from userconfig import *
import util
from layers_custom import *

###################################################################################################################
# create Theano variables for input minibatch
input_var = T.tensor4('X')
# note that in general, the main data tensors will have these axes:
#  - minibatchsize
#  - numchannels (always 1 for us, since spectrograms)
#  - numfilts (or specbinnum for input)
#  - numtimebins

if example_is_audio:
    # load our example audio file as a specgram
    examplegram = util.standard_specgram(util.load_soundfile(examplewavpath, 0))
    print("examplegram is of shape %s" % str(np.shape(examplegram)))

###################################################################################################################
# here we define our "semi-convolutional" autoencoder
# NOTE: lasagne assumes pooling is on the TRAILING axis of the tensor, so we always use time as the trailing axis

def make_custom_convlayer(network, in_num_chans, out_num_chans):
    """Applies our special padding and reshaping to do 1D convolution on 2D data.

    Parameters:
        network: lasagne layer to stack on top of
        in_num_chans: number of input rows (filter/frequency bins) each conv filter spans
        out_num_chans: number of convolutional filters to learn

    Returns:
        (network, filters): the new layer stack, plus the Conv2DLayer's weight
        tensor W, grabbed BEFORE the reshape so callers can inspect/regularize it.
    """
    # Pad "manually" along the trailing (time) axis only.
    # NOTE: "batch_ndim" is used to stop batch dims being padded, but here ALSO to skip first data dim.
    # FIX: use integer division (//): under Python 3, (featframe_len-1)/2 yields a
    # float, and PadLayer requires an integer pad width. (Identical value on Python 2.)
    network = lasagne.layers.PadLayer(network, width=(featframe_len - 1) // 2, batch_ndim=3)
    print("shape after pad layer: %s" % str(network.output_shape))
    # Each filter spans ALL in_num_chans rows x featframe_len time steps, so the
    # convolution is effectively 1D along time — we padded "manually" above in
    # order to do it in one dimension only.
    network = lasagne.layers.Conv2DLayer(network, out_num_chans, (in_num_chans, featframe_len),
                                         stride=(1, 1), pad=0,
                                         nonlinearity=very_leaky_rectify, W=lasagne.init.Orthogonal())
    filters = network.W
    # reinterpret channels as rows
    network = lasagne.layers.ReshapeLayer(network, ([0], [2], [1], [3]))
    print("shape after conv layer: %s" % str(network.output_shape))
    return network, filters
from userconfig import *
import util
from layers_custom import *

###################################################################################################################
# create Theano variables for input minibatch
input_var = T.tensor4('X')
# note that in general, the main data tensors will have these axes:
#  - minibatchsize
#  - numchannels (always 1 for us, since spectrograms)
#  - numfilts (or specbinnum for input)
#  - numtimebins

if example_is_audio:
    # load our example audio files (signal-of-interest and background) as specgrams
    foregroundgram = util.standard_specgram(util.load_soundfile(foregroundwavpath, 0))
    backgroundgram = util.standard_specgram(util.load_soundfile(backgroundwavpath, 0))
    print("foregroundgram is of shape %s" % str(np.shape(foregroundgram)))
    print("backgroundgram is of shape %s" % str(np.shape(backgroundgram)))

###################################################################################################################
# here we define our "semi-convolutional" autoencoder
# NOTE: lasagne assumes pooling is on the TRAILING axis of the tensor, so we always use time as the trailing axis

def make_custom_convlayer(network, in_num_chans, out_num_chans):
    """Applies our special padding and reshaping to do 1D convolution on 2D data."""
    # Pad "manually" along the trailing (time) axis only.
    # NOTE: "batch_ndim" is used to stop batch dims being padded, but here ALSO to skip first data dim.
    # FIX: use integer division (//): under Python 3, (featframe_len-1)/2 yields a
    # float, and PadLayer requires an integer pad width. (Identical value on Python 2.)
    network = lasagne.layers.PadLayer(
        network, width=(featframe_len - 1) // 2, batch_ndim=3
    )
    # NOTE(review): this copy of the function appears truncated here (no conv /
    # reshape / return) — presumably a stale duplicate of the complete version
    # elsewhere in this file; confirm before relying on it.
from userconfig import *
import util
from layers_custom import *

###################################################################################################################
# create Theano variables for input minibatch
input_var = T.tensor4('X')
# note that in general, the main data tensors will have these axes:
#  - minibatchsize
#  - numchannels (always 1 for us, since spectrograms)
#  - numfilts (or specbinnum for input)
#  - numtimebins

if example_is_audio:
    # load our example audio files (signal-of-interest and background) as specgrams
    foregroundgram = util.standard_specgram(util.load_soundfile(foregroundwavpath, 0))
    backgroundgram = util.standard_specgram(util.load_soundfile(backgroundwavpath, 0))
    print("foregroundgram is of shape %s" % str(np.shape(foregroundgram)))
    print("backgroundgram is of shape %s" % str(np.shape(backgroundgram)))

###################################################################################################################
# here we define our "semi-convolutional" autoencoder
# NOTE: lasagne assumes pooling is on the TRAILING axis of the tensor, so we always use time as the trailing axis

def make_custom_convlayer(network, in_num_chans, out_num_chans):
    """Applies our special padding and reshaping to do 1D convolution on 2D data."""
    # Pad "manually" along the trailing (time) axis only.
    # NOTE: "batch_ndim" is used to stop batch dims being padded, but here ALSO to skip first data dim.
    # FIX: use integer division (//): under Python 3, (featframe_len-1)/2 yields a
    # float, and PadLayer requires an integer pad width. (Identical value on Python 2.)
    network = lasagne.layers.PadLayer(network, width=(featframe_len - 1) // 2, batch_ndim=3)
    print("shape after pad layer: %s" % str(network.output_shape))
    # Each filter spans ALL in_num_chans rows x featframe_len time steps, so the
    # convolution is effectively 1D along time — we padded "manually" above in
    # order to do it in one dimension only.
    network = lasagne.layers.Conv2DLayer(network, out_num_chans, (in_num_chans, featframe_len),
                                         stride=(1, 1), pad=0,
                                         nonlinearity=very_leaky_rectify, W=lasagne.init.Orthogonal())
    filters = network.W
    # NOTE(review): this copy of the function appears truncated here (no reshape /
    # return of (network, filters)) — presumably a stale duplicate of the complete
    # version elsewhere in this file; confirm before relying on it.