def readHtk(filename, chunk_size=None, preSamples=None):
    """
    Reads the features in a HTK file, and yields them as 2-D numpy arrays.

    This is a generator. Without chunk_size it yields a single array holding
    every sample of the file; with chunk_size it yields the file piece by
    piece. In every yielded array each row is one sample and the number of
    columns is sampSize // 4 (one column per 4-byte big-endian float).

    filename:   path of the HTK file, opened through smart_open.
    chunk_size: integer specifying number of samples per chunk, or None to
                disable chunking. It is reduced to the number of samples in
                the file when the file is shorter than one chunk.
    preSamples: integer specifying the number of samples to prepend to every
                chunk after the first one, to try and deal with issues at
                chunk boundaries. None is treated as 0 (no overlap).

    Raises AssertionError if chunk_size is not positive, or (when chunking)
    if the header reports a non-positive nSamples or sampSize.
    """
    # Only do chunking if chunk_size is passed to the function.
    if chunk_size is not None:
        assert chunk_size > 0, "chunk_size needs to be > 0"

    with smart_open(filename, "rb") as f:
        # Read header; sampPeriod and parmKind are not used.
        nSamples, sampPeriod, sampSize, parmKind = struct.unpack(
            ">iihh", f.read(12))

        def readSamples(count):
            # Decode the next `count` samples at the current file position.
            nBytes = count * sampSize
            data = struct.unpack(">%df" % (nBytes // 4), f.read(nBytes))
            return numpy.array(data).reshape(count, sampSize // 4)

        if chunk_size is None:
            yield readSamples(nSamples)
            return

        assert nSamples > 0, "nSamples needs to be > 0"
        assert sampSize > 0, "sampSize needs to be > 0"

        # Previously a missing preSamples made the function fail with a
        # TypeError right after the first chunk; treat it as "no overlap".
        overlap = 0 if preSamples is None else preSamples

        # If the size of the features is less than the chunk size.
        if nSamples < chunk_size:
            chunk_size = nSamples

        # Iterate over all full chunks first.
        for i in range(nSamples // chunk_size):
            # We want to add a few samples to the beginning of each chunk,
            # but only after the first one.
            if i == 0:
                yield readSamples(chunk_size)
            else:
                yield readSamples(chunk_size + overlap)
            # Move the file cursor back so that the next chunk reuses
            # some of the same samples.
            f.seek(-(overlap * sampSize), 1)

        # Whatever remains after the last full chunk size.
        remainder = nSamples % chunk_size
        if remainder > 0:
            yield readSamples(remainder + overlap)
def readFmatrix(filename):
    """
    Loads a float matrix from a Janus feature file.

    The first 16 bytes are decoded as four big-endian 32-bit integers, of
    which the second and third are taken as the number of rows and columns.
    Everything after the header is decoded as big-endian 32-bit floats and
    returned as a 2-D numpy array of shape (rows, cols).
    """
    with smart_open(filename, "rb") as f:
        header = struct.unpack(">4i", f.read(16))
        nRows, nCols = header[1], header[2]
        values = struct.unpack(">%df" % (nRows * nCols), f.read())
        matrix = numpy.array(values)
        return matrix.reshape(nRows, nCols)
def writeFmatrix(filename, matrix):
    """
    Writes a float matrix to a Janus feature file.

    filename: path of the output file, opened through smart_open.
    matrix:   2-D numpy array; its shape supplies the row and column counts.

    The file consists of the 4-byte tag "FMAT", three big-endian 32-bit
    integers (rows, columns, 0), and then every element of the matrix in
    row-major order as big-endian 32-bit floats.
    """
    with smart_open(filename, "wb") as f:
        # The handle is binary, so the tag has to be a bytes literal; a text
        # string raises TypeError on Python 3.
        f.write(b"FMAT")
        f.write(struct.pack(">3i", matrix.shape[0], matrix.shape[1], 0))
        f.write(struct.pack(">%df" % matrix.size, *matrix.ravel()))
def writeScp(filename, uttids, pointers):
    """
    Writes a Kaldi script file.

    "uttids" is a list of utterance IDs and "pointers" a list of strings in
    the format "filename:offset". The two lists are paired up element by
    element, and each pair becomes one line "<uttid> <pointer>".
    """
    with smart_open(filename, "w") as scp:
        for entry in zip(uttids, pointers):
            # entry is an (uttid, pointer) pair and fills both placeholders
            scp.write("%s %s\n" % entry)
def readScp(filename, limit=numpy.inf):
    """
    Loads the feature matrices referenced by a Kaldi script file.

    Every line of the script file is expected to hold an utterance ID and a
    pointer of the form "arkfile:offset". For each line the ark file is
    opened, positioned at the offset, and one matrix is read with readMatrix.
    Reading stops once "limit" matrices have been collected.

    Returns a list of feature matrices and a list of the utterance IDs.
    """
    features, uttids = [], []
    with smart_open(filename, "r") as scp:
        for line in scp:
            uttid, pointer = line.strip().split()
            # Split on the last colon, since the ark path may contain colons
            sep = pointer.rfind(":")
            arkfile = pointer[:sep]
            offset = int(pointer[sep + 1:])
            with smart_open(arkfile, "rb") as ark:
                ark.seek(offset)
                matrix = readMatrix(ark)
            features.append(matrix)
            uttids.append(uttid)
            if len(features) == limit:
                break
    return features, uttids
def readPfile(filename):
    """
    Reads the contents of a pfile.

    Returns a tuple (features, labels), where both elements are lists of 2-D
    numpy arrays. Each element of a list corresponds to a sentence; each row
    of a 2-D array corresponds to a frame. In the case where the pfile
    doesn't contain labels, "labels" will be None.

    The header is parsed line by line up to "-end"; it is assumed to contain
    every field used below. The sentence index stored after the data gives
    the number of the first frame of each sentence.
    """
    with smart_open(filename, "rb") as f:
        # Read header
        # Assuming all data are consistent
        # The file is opened in binary mode, so the header tokens are bytes
        # and must be compared with bytes literals (a str comparison never
        # matches on Python 3).
        for line in f:
            tokens = line.split()
            if not tokens:
                continue  # tolerate blank lines in the header
            key = tokens[0]
            if key == b"-pfile_header":
                headerSize = int(tokens[4])
            elif key == b"-num_sentences":
                nSentences = int(tokens[1])
            elif key == b"-num_frames":
                nFrames = int(tokens[1])
            elif key == b"-first_feature_column":
                cFeature = int(tokens[1])
            elif key == b"-num_features":
                nFeatures = int(tokens[1])
            elif key == b"-first_label_column":
                cLabel = int(tokens[1])
            elif key == b"-num_labels":
                nLabels = int(tokens[1])
            elif key == b"-format":
                # One character per column; "d" columns are unpacked as
                # 32-bit integers.
                rowFormat = tokens[1].replace(b"d", b"i")
            elif key == b"-end":
                break
        nCols = len(rowFormat)
        dataSize = nFrames * nCols

        # Read sentence index
        f.seek(headerSize + dataSize * 4)
        index = struct.unpack(">%di" % (nSentences + 1),
                              f.read(4 * (nSentences + 1)))

        # Read data
        f.seek(headerSize)
        features = []
        labels = []
        sen = 0
        for i in range(nFrames):
            # "while" rather than "if": an empty sentence shares its start
            # frame with the next one, and every such sentence needs its own
            # (empty) entry to keep the index aligned.
            while sen < len(index) and i == index[sen]:
                features.append([])
                labels.append([])
                sen += 1
            data = struct.unpack(b">" + rowFormat, f.read(4 * nCols))
            features[-1].append(data[cFeature:cFeature + nFeatures])
            labels[-1].append(data[cLabel:cLabel + nLabels])
        features = [numpy.array(x) for x in features]
        labels = [numpy.array(x) for x in labels] if nLabels > 0 else None
    return (features, labels)
def readPfile(filename):
    """
    Reads the contents of a pfile.

    Returns a tuple (feature, label), where both elements are lists of 2-D
    numpy arrays. Each element of a list corresponds to a sentence; each row
    of a 2-D array corresponds to a frame. In the case where the pfile
    doesn't contain labels, "label" will be None.

    The header is parsed line by line up to "-end"; it is assumed to contain
    every field used below. The sentence index stored after the data gives
    the number of the first frame of each sentence.
    """
    with smart_open(filename, "rb") as f:
        # Read header
        # Assuming all data are consistent
        # Header tokens are bytes because the file is opened in binary mode;
        # comparing them with str literals never matches on Python 3.
        for line in f:
            tokens = line.split()
            if not tokens:
                continue  # tolerate blank lines in the header
            key = tokens[0]
            if key == b"-pfile_header":
                headerSize = int(tokens[4])
            elif key == b"-num_sentences":
                nSentences = int(tokens[1])
            elif key == b"-num_frames":
                nFrames = int(tokens[1])
            elif key == b"-first_feature_column":
                cFeature = int(tokens[1])
            elif key == b"-num_features":
                nFeatures = int(tokens[1])
            elif key == b"-first_label_column":
                cLabel = int(tokens[1])
            elif key == b"-num_labels":
                nLabels = int(tokens[1])
            elif key == b"-format":
                # One character per column; "d" columns are unpacked as
                # 32-bit integers.
                rowFormat = tokens[1].replace(b"d", b"i")
            elif key == b"-end":
                break
        nCols = len(rowFormat)
        dataSize = nFrames * nCols

        # Read sentence index
        f.seek(headerSize + dataSize * 4)
        index = struct.unpack(">%di" % (nSentences + 1),
                              f.read(4 * (nSentences + 1)))

        # Read data
        f.seek(headerSize)
        feature = []
        label = []
        sen = 0
        for i in range(nFrames):
            # Open a new sentence for every index entry equal to this frame
            # number, so that empty sentences do not shift later boundaries.
            while sen < len(index) and i == index[sen]:
                feature.append([])
                label.append([])
                sen += 1
            data = struct.unpack(b">" + rowFormat, f.read(4 * nCols))
            feature[-1].append(data[cFeature:cFeature + nFeatures])
            label[-1].append(data[cLabel:cLabel + nLabels])
        feature = [numpy.array(x) for x in feature]
        label = [numpy.array(x) for x in label] if nLabels > 0 else None
    return (feature, label)
def writeHtk(filename, feature, sampPeriod, parmKind):
    """
    Stores a 2-D numpy array of features as a HTK file.

    The 12-byte header holds the number of rows of "feature", the given
    sampPeriod, the size of one row in bytes (4 per column) and the given
    parmKind. The values follow in row-major order as big-endian 32-bit
    floats.
    """
    with smart_open(filename, "wb") as out:
        # Write header
        nSamples = feature.shape[0]
        nDims = feature.shape[1]
        sampSize = nDims * 4
        header = struct.pack(">iihh", nSamples, sampPeriod, sampSize,
                             parmKind)
        out.write(header)
        # Write data
        dataFormat = ">%df" % (nSamples * sampSize / 4)
        out.write(struct.pack(dataFormat, *feature.ravel()))
def writePfile(filename, features, labels=None):
    """
    Stores "features" and, optionally, "labels" in a pfile.

    Both arguments are lists with one entry per sentence; each entry holds
    one row per frame. Label entries may be 1-D arrays when there is a single
    label per frame. Every frame is written as the sentence number, the frame
    number within the sentence, the feature values as floats and the label
    values as integers. A table with the first frame number of every sentence
    (plus the total frame count) follows the data.
    """
    nSentences = len(features)
    nFrames = sum(len(x) for x in features)
    nFeatures = len(numpy.array(features[0][0]).ravel())
    if labels is not None:
        nLabels = len(numpy.array(labels[0][0]).ravel())
    else:
        nLabels = 0
    nCols = 2 + nFeatures + nLabels
    headerSize = 32768
    dataSize = nFrames * nCols

    headerLines = [
        "-pfile_header version 0 size %d\n" % headerSize,
        "-num_sentences %d\n" % nSentences,
        "-num_frames %d\n" % nFrames,
        "-first_feature_column 2\n",
        "-num_features %d\n" % nFeatures,
        "-first_label_column %d\n" % (2 + nFeatures),
        "-num_labels %d\n" % nLabels,
        "-format dd" + "f" * nFeatures + "d" * nLabels + "\n",
        "-data size %d offset 0 ndim 2 nrow %d ncol %d\n"
        % (dataSize, nFrames, nCols),
        "-sent_table_data size %d offset %d ndim 1\n"
        % (nSentences + 1, dataSize),
        "-end\n",
    ]
    featureFormat = ">%df" % nFeatures
    labelFormat = ">%di" % nLabels

    with smart_open(filename, "wb") as f:
        # Write header
        for text in headerLines:
            writeBytes(f, text)

        # Write data; it starts at a fixed offset after the header text
        f.seek(headerSize)
        for i, sentence in enumerate(features):
            for j, frame in enumerate(sentence):
                f.write(struct.pack(">2i", i, j))
                f.write(struct.pack(featureFormat,
                                    *numpy.array(frame).ravel()))
                if labels is not None:
                    frameLabels = numpy.array(labels[i][j].astype(int))
                    f.write(struct.pack(labelFormat, *frameLabels.ravel()))

        # Write sentence index
        index = numpy.cumsum([0] + [len(x) for x in features])
        f.write(struct.pack(">%di" % (nSentences + 1), *index))
def readHtk(filename):
    """
    Reads the features in a HTK file, and returns them in a 2-D numpy array.

    The 12-byte header supplies the number of samples (rows) and the size of
    one sample in bytes; every value is a 4-byte big-endian float, so the
    result has sampSize // 4 columns.
    """
    with smart_open(filename, "rb") as f:
        # Read header
        nSamples, sampPeriod, sampSize, parmKind = struct.unpack(
            ">iihh", f.read(12))
        # sampPeriod and parmKind will be omitted
        # Integer division: reshape() rejects the float that "/" produces
        # on Python 3.
        nDims = sampSize // 4
        # Read data
        data = struct.unpack(">%df" % (nSamples * nDims),
                             f.read(nSamples * sampSize))
        return numpy.array(data).reshape(nSamples, nDims)
def writeHtk(filename, feature, sampPeriod, parmKind):
    """
    Saves the 2-D numpy array "feature" in HTK format.

    First comes a header with the row count, sampPeriod, the number of bytes
    per row (four per column) and parmKind; then all values, row by row, as
    big-endian 32-bit floats.
    """
    with smart_open(filename, "wb") as htk:
        # Write header
        nSamples, sampSize = feature.shape[0], feature.shape[1] * 4
        htk.write(
            struct.pack(">iihh", nSamples, sampPeriod, sampSize, parmKind))
        # Write data
        nValues = nSamples * sampSize / 4
        values = feature.ravel()
        htk.write(struct.pack(">%df" % nValues, *values))
def writeArk(filename, features, uttids):
    """
    Writes feature matrices and their utterance IDs to a Kaldi ark file.

    "features" and "uttids" are paired up element by element. For each pair
    the ID is written with writeString, the file position is recorded, and
    the matrix is written with writeMatrix.

    Returns a list of strings in the format "filename:offset", one per
    matrix, which can be used to write a Kaldi script file.
    """
    pointers = []
    with smart_open(filename, "wb") as ark:
        for matrix, uttid in zip(features, uttids):
            writeString(ark, uttid)
            # The pointer refers to the position right after the ID
            offset = ark.tell()
            pointers.append("%s:%d" % (filename, offset))
            writeMatrix(ark, matrix)
    return pointers
def readHtk(filename):
    """
    Loads the features of a HTK file into a 2-D numpy array.

    The header gives the number of samples and the byte size of one sample.
    The values are decoded as big-endian 32-bit floats and arranged with one
    sample per row.
    """
    with smart_open(filename, "rb") as htk:
        # Read header
        header = struct.unpack(">iihh", htk.read(12))
        # sampPeriod and parmKind will be omitted
        nSamples, sampSize = header[0], header[2]
        # Read data
        nBytes = nSamples * sampSize
        values = struct.unpack(">%df" % (nSamples * sampSize / 4),
                               htk.read(nBytes))
        nDims = int(sampSize / 4)
        return numpy.array(values).reshape(nSamples, nDims)
def readArk(filename, limit=numpy.inf):
    """
    Loads the feature matrices stored in a Kaldi ark file.

    Entries are read one after another: an utterance ID with readString,
    then a matrix with readMatrix. Reading ends when readString raises
    ValueError or once "limit" matrices have been collected.

    Returns a list of feature matrices and a list of the utterance IDs.
    """
    features, uttids = [], []
    with smart_open(filename, "rb") as ark:
        while True:
            try:
                uttid = readString(ark)
            except ValueError:
                # No further utterance ID could be read
                break
            matrix = readMatrix(ark)
            features.append(matrix)
            uttids.append(uttid)
            if len(features) == limit:
                break
    return features, uttids
def writePfile(filename, feature, label=None):
    """
    Writes "feature" and "label" to a pfile.

    Both inputs "feature" and "label" should be lists of 2-D numpy arrays.
    Each element of a list corresponds to a sentence; each row of a 2-D array
    corresponds to a frame. In the case where there is only one label per
    frame, the elements of the "label" list can be 1-D arrays.

    Every frame is written as the sentence number, the frame number within
    the sentence, the feature values as floats and the label values as
    integers (labels are cast to int before packing). A table with the first
    frame number of every sentence, plus the total frame count, follows the
    data.
    """
    nSentences = len(feature)
    nFrames = sum(len(x) for x in feature)
    nFeatures = len(numpy.array(feature[0][0]).ravel())
    nLabels = len(numpy.array(label[0][0]).ravel()) if label is not None else 0
    nCols = 2 + nFeatures + nLabels
    headerSize = 32768
    dataSize = nFrames * nCols

    header = "".join([
        "-pfile_header version 0 size %d\n" % headerSize,
        "-num_sentences %d\n" % nSentences,
        "-num_frames %d\n" % nFrames,
        "-first_feature_column 2\n",
        "-num_features %d\n" % nFeatures,
        "-first_label_column %d\n" % (2 + nFeatures),
        "-num_labels %d\n" % nLabels,
        "-format dd" + "f" * nFeatures + "d" * nLabels + "\n",
        "-data size %d offset 0 ndim 2 nrow %d ncol %d\n"
        % (dataSize, nFrames, nCols),
        "-sent_table_data size %d offset %d ndim 1\n"
        % (nSentences + 1, dataSize),
        "-end\n",
    ])

    with smart_open(filename, "wb") as f:
        # Write header
        # The handle is binary, so the text has to be encoded; writing a str
        # raises TypeError on Python 3.
        f.write(header.encode("ascii"))

        # Write data; it starts at a fixed offset after the header text
        f.seek(headerSize)
        for i in range(nSentences):
            for j in range(len(feature[i])):
                f.write(struct.pack(">2i", i, j))
                f.write(struct.pack(">%df" % nFeatures,
                                    *numpy.array(feature[i][j]).ravel()))
                if label is not None:
                    # The "i" format only accepts integers
                    frameLabels = numpy.array(label[i][j]).astype(int)
                    f.write(struct.pack(">%di" % nLabels,
                                        *frameLabels.ravel()))

        # Write sentence index
        index = numpy.cumsum([0] + [len(x) for x in feature])
        f.write(struct.pack(">%di" % (nSentences + 1), *index))
def writeAudioSet(filename, wav, labels):
    """
    Writes audio and labels to a Google Audio Set file (disguised as .flac).

    Takes two variables as input:
      * wav -- a 2-D numpy array, where each row is a waveform
               (10 seconds @ 16 kHz, mono, dtype is arbitrary);
      * labels -- a 2-D numpy array of zeros and ones, where each row
                  indicates the sound events active in the corresponding
                  waveform.
    The number of rows in the two variables must match; otherwise ValueError
    is raised. The caller's "wav" array is not modified.

    The audio is concatenated and compressed in the FLAC format. The labels
    are packed eight per byte (label i goes to bit i % 8 of byte i // 8) and
    appended to the FLAC audio file, followed by three little-endian 32-bit
    integers: the number of clips, samples per clip, and labels.

    This function relies on ffmpeg. A temporary file "<filename>.wav" is
    created and removed again once ffmpeg has succeeded;
    subprocess.CalledProcessError is raised if ffmpeg fails.
    """
    import os  # only needed to delete the temporary .wav file

    # Validate input
    if len(wav) != len(labels):
        raise ValueError(
            "The number of rows in 'wav' and 'labels' must match.")

    # Convert wav to int16, ensuring the correct range
    nClips, nSamples = wav.shape
    if numpy.abs(wav).max() <= 1:
        # Build a scaled copy; an in-place "*=" would silently rescale the
        # array owned by the caller.
        wav = wav * 32768
    wav = numpy.maximum(numpy.minimum(wav, 32767), -32768).astype("int16")

    # Convert labels to bit arrays
    nLabels = labels.shape[1]
    labels = labels.astype("uint8")
    # Integer division: array sizes and indices must not be floats
    nBytes = (nLabels - 1) // 8 + 1
    packed = numpy.zeros((nClips, nBytes), dtype="uint8")
    for i in range(nLabels):
        packed[:, i // 8] += labels[:, i] << (i % 8)

    # Write file
    wavName = filename + ".wav"
    wavfile.write(wavName, 16000, wav.ravel())
    # Pass the arguments as a list instead of a shell string, so that file
    # names with spaces or shell metacharacters are handled literally.
    subprocess.check_output(
        ["ffmpeg", "-i", wavName, "-c:a", "flac", "-y", filename])
    os.remove(wavName)
    with smart_open(filename, "ab") as f:
        f.write(struct.pack("<%dB" % packed.size, *packed.ravel()))
        f.write(struct.pack("<3i", nClips, nSamples, nLabels))
def readAudioSet(filename):
    """
    Reads audio and labels from a Google Audio Set file (disguised as .flac).

    Returns two variables:
      * wav -- a 2-D numpy float32 array, where each row is a waveform
               (10 seconds @ 16 kHz, mono);
      * labels -- a 2-D numpy int32 array of zeros and ones, where each row
                  indicates the sound events active in the corresponding
                  waveform.

    The last 12 bytes of the file hold three little-endian 32-bit integers:
    the number of clips, samples per clip, and labels. Directly before them
    are the labels, packed eight per byte (label i is bit i % 8 of byte
    i // 8 of its clip).
    """
    wav, _ = librosa.core.load(filename, sr=16000, dtype="float32")
    with smart_open(filename, "rb") as f:
        f.seek(-12, 2)
        nClips, nSamples, nLabels = struct.unpack("<3i", f.read(12))
        wav = wav.reshape(nClips, nSamples)
        # Integer division: sizes, offsets and indices must not be floats
        nBytes = (nLabels - 1) // 8 + 1
        f.seek(-12 - nClips * nBytes, 2)
        data = struct.unpack("<%dB" % (nClips * nBytes),
                             f.read(nClips * nBytes))
    # An explicit integer dtype keeps the bit operations valid even when
    # "data" is empty.
    packed = numpy.array(data, dtype="uint8").reshape(nClips, nBytes)
    labels = numpy.zeros((nClips, nLabels), dtype="int32")
    for i in range(nLabels):
        labels[:, i] = (packed[:, i // 8] >> (i % 8)) & 1
    return wav, labels
def save(self, filename):
    """
    Serializes the object returned by self.getParams() into "filename"
    using dill.
    """
    with smart_open(filename, "wb") as out:
        params = self.getParams()
        dill.dump(params, out)
def __init__(
        self,
        Nlayers=1,  # number of layers
        Ndirs=1,  # unidirectional or bidirectional
        Nx=100,  # input size
        Nh=100,  # hidden layer size
        Ny=100,  # output size
        Ah="relu",  # hidden unit activation (e.g. relu, tanh, lstm)
        Ay="linear",  # output unit activation (e.g. linear, sigmoid, softmax)
        predictPer="frame",  # frame or sequence
        loss=None,  # loss function (e.g. mse, ce, ce_group, hinge, squared_hinge)
        L1reg=0.0,  # L1 regularization
        L2reg=0.0,  # L2 regularization
        dropout=0.0,  # dropout
        momentum=0.0,  # SGD momentum
        seed=15213,  # random seed for initializing the weights
        frontEnd=None,  # a lambda function for transforming the input
        filename=None,  # initialize from file
        initParams=None,  # initialize from given dict
):
    """
    Builds the network, either from saved parameters or from scratch.

    If "filename" is given, a parameter dict is loaded from it with dill and
    takes the place of "initParams". If a parameter dict is available, numpy
    arrays in it are registered with addParam and all other entries become
    attributes that override the corresponding keyword arguments. Otherwise
    the keyword arguments are stored as attributes and the weight matrices
    are initialized randomly using "seed".

    Afterwards the Theano graph is built and two compiled functions are
    attached to the object:
      * self.train -- takes (input, mask, label, lrate, clip), or
        (input, mask, label, label_time, tol, lrate, clip) when the loss is
        "ctc"; applies one clipped gradient update and returns the cost;
      * self.predict -- takes (input, mask) and returns the output computed
        without dropout.

    When "loss" is None it is chosen from the output activation
    (linear -> mse, sigmoid -> ce, softmax -> ce_group).
    """
    if filename is not None:  # load parameters from file
        # NOTE(review): dill.load can execute arbitrary code while
        # unpickling; only load model files from trusted sources.
        with smart_open(filename, "rb") as f:
            initParams = dill.load(f)
    if initParams is not None:  # load parameters from given dict
        self.paramNames = []
        self.params = []
        # items() instead of the Python-2-only iteritems()
        for k, v in initParams.items():
            if type(v) is numpy.ndarray:
                self.addParam(k, v)
            else:
                setattr(self, k, v)
                self.paramNames.append(k)
        # locals()[k] = v doesn't work; I have to do this statically
        Nlayers, Ndirs, Nx, Nh, Ny, Ah, Ay, predictPer, loss, L1reg, L2reg, dropout, momentum, frontEnd \
            = self.Nlayers, self.Ndirs, self.Nx, self.Nh, self.Ny, self.Ah, self.Ay, self.predictPer, self.loss, self.L1reg, self.L2reg, self.dropout, self.momentum, self.frontEnd
    else:
        # Initialize parameters randomly
        # Names of parameters to save to file
        self.paramNames = [
            "Nlayers", "Ndirs", "Nx", "Nh", "Ny", "Ah", "Ay", "predictPer",
            "loss", "L1reg", "L2reg", "dropout", "momentum", "frontEnd"
        ]
        for name in self.paramNames:
            value = locals()[name]
            setattr(self, name, value)

        # Values of parameters for building the computational graph
        self.params = []

        # Initialize random number generator used for the initial weights
        global rng
        rng = numpy.random.RandomState(seed)

        # Construct parameter matrices
        # The LSTM needs four blocks of Nh columns (one per gate / candidate)
        Nlstm = 4 if Ah == 'lstm' else 1
        self.addParam("Win", rand_init((Nx, Nh * Ndirs * Nlstm), Ah))
        self.addParam("Wrec",
                      rand_init((Nlayers, Ndirs, Nh, Nh * Nlstm), Ah))
        self.addParam(
            "Wup",
            rand_init((Nlayers - 1, Nh * Ndirs, Nh * Ndirs * Nlstm), Ah))
        self.addParam("Wout", rand_init((Nh * Ndirs, Ny), Ay))
        if Ah != "lstm":
            self.addParam("Bhid", zeros((Nlayers, Nh * Ndirs)))
        else:
            # The first Nh columns (the forget gate in step_lstm) start at
            # 1.0, the remaining three blocks at 0
            self.addParam(
                "Bhid",
                numpy.tile(
                    numpy.hstack([
                        full((Nlayers, Nh), 1.0),
                        zeros((Nlayers, Nh * 3))
                    ]), (1, Ndirs)))
        self.addParam("Bout", zeros(Ny))
        self.addParam("h0", zeros((Nlayers, Ndirs, Nh)))
        if Ah == "lstm":
            self.addParam("c0", zeros((Nlayers, Ndirs, Nh)))

    # Random stream for the dropout masks. It is created on both paths:
    # a model restored from saved parameters with dropout != 0 needs it too
    # when the graph below is built.
    theano_rng = RandomStreams(seed)

    # Compute total number of parameters
    self.nParams = sum(x.get_value().size for x in self.params)

    # Initialize gradient tensors when using momentum
    if momentum > 0:
        self.dparams = [
            theano.shared(zeros(x.get_value().shape)) for x in self.params
        ]

    # Build computation graph
    input = T.ftensor3()
    mask = T.imatrix()
    # Index 0 is derived from odd mask values and index 1 from mask values
    # >= 2. mask_float[d] is fed to the recurrence of direction d, where a
    # nonzero entry restarts the state from the initial state.
    mask_int = [(mask % 2).nonzero(), (mask >= 2).nonzero()]
    mask_float = [
        T.cast((mask % 2).dimshuffle((1, 0)).reshape(
            (mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
        T.cast((mask >= 2).dimshuffle((1, 0)).reshape(
            (mask.shape[1], mask.shape[0], 1)), theano.config.floatX)
    ]
    # mask_int = [(mask & 1).nonzero(), (mask & 2).nonzero()]
    # mask_float = [T.cast((mask & 1).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
    #               T.cast(((mask & 2) / 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)]

    def step_rnn(x_t, mask, h_tm1, W, h0):
        # One step of a plain recurrent layer
        h_tm1 = T.switch(mask, h0, h_tm1)
        return [ACTIVATION[Ah](x_t + h_tm1.dot(W))]

    def step_lstm(x_t, mask, c_tm1, h_tm1, W, c0, h0):
        # One step of an LSTM layer; the pre-activation is split into
        # forget gate, input gate, output gate and cell candidate
        c_tm1 = T.switch(mask, c0, c_tm1)
        h_tm1 = T.switch(mask, h0, h_tm1)
        a = x_t + h_tm1.dot(W)
        f_t = T.nnet.sigmoid(a[:, :Nh])
        i_t = T.nnet.sigmoid(a[:, Nh:Nh * 2])
        o_t = T.nnet.sigmoid(a[:, Nh * 2:Nh * 3])
        c_t = T.tanh(a[:, Nh * 3:]) * i_t + c_tm1 * f_t
        h_t = T.tanh(c_t) * o_t
        return [c_t, h_t]

    x = input if frontEnd is None else frontEnd(input)

    def forward_pass(x, dropout):
        # Builds the output expression; dropout is applied to the input and
        # to the output of every layer when dropout != 0
        if dropout != 0.0:
            x *= theano_rng.binomial(
                n=1, p=1 - dropout, size=x.shape,
                dtype=theano.config.floatX) / (1 - dropout)
        for i in range(Nlayers):
            h = (x.dimshuffle((1, 0, 2)).dot(self.Win)
                 if i == 0 else h.dot(self.Wup[i - 1])) + self.Bhid[i]
            # Repeats an initial-state vector once per row of the batch
            rep = lambda x: T.extra_ops.repeat(
                x.reshape((1, -1)), h.shape[1], axis=0)
            if Ah != "lstm":
                h = T.concatenate([
                    theano.scan(
                        fn=step_rnn,
                        sequences=[
                            h[:, :, Nh * d:Nh * (d + 1)], mask_float[d]
                        ],
                        outputs_info=[rep(self.h0[i, d])],
                        non_sequences=[
                            self.Wrec[i, d],
                            rep(self.h0[i, d])
                        ],
                        go_backwards=(d == 1),
                    )[0][::(1 if d == 0 else -1)] for d in range(Ndirs)
                ], axis=2)
            else:
                h = T.concatenate([
                    theano.scan(
                        fn=step_lstm,
                        sequences=[
                            h[:, :, Nh * 4 * d:Nh * 4 * (d + 1)],
                            mask_float[d]
                        ],
                        outputs_info=[
                            rep(self.c0[i, d]),
                            rep(self.h0[i, d])
                        ],
                        non_sequences=[
                            self.Wrec[i, d],
                            rep(self.c0[i, d]),
                            rep(self.h0[i, d])
                        ],
                        go_backwards=(d == 1),
                    )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs)
                ], axis=2)
            if dropout != 0.0:
                h *= theano_rng.binomial(
                    n=1, p=1 - dropout, size=h.shape,
                    dtype=theano.config.floatX) / (1 - dropout)
        h = h.dimshuffle((1, 0, 2))
        if predictPer == "sequence":
            # Keep only the hidden states at the positions selected by the
            # mask: direction d uses mask_int[1 - d]
            h = T.concatenate([
                h[mask_int[1 - d]][:, Nh * d:Nh * (d + 1)]
                for d in range(Ndirs)
            ], axis=1)
        return ACTIVATION[Ay](h.dot(self.Wout) + self.Bout)

    output = forward_pass(x, 0.0)
    output_dropout = output if dropout == 0.0 else forward_pass(x, dropout)

    # Compute loss function
    if loss is None:
        loss = {
            "linear": "mse",
            "sigmoid": "ce",
            "softmax": "ce_group"
        }[self.Ay]
    if loss == "ctc":
        label = T.imatrix()
        label_time = T.imatrix()
        tol = T.iscalar()
        cost = ctc_cost(output_dropout, mask, label, label_time, tol)
    else:
        if predictPer == "sequence":
            label = T.fmatrix()
            y = output_dropout
            t = label
        elif predictPer == "frame":
            label = T.ftensor3()
            # Only positions with a non-negative mask value contribute
            indices = (mask >= 0).nonzero()
            y = output_dropout[indices]
            t = label[indices]
        cost = T.mean({
            "ce":
            -T.mean(T.log(y) * t + T.log(1 - y) * (1 - t), axis=1),
            "ce_group":
            -T.log((y * t).sum(axis=1)),
            "mse":
            T.mean((y - t)**2, axis=1),
            "hinge":
            T.mean(relu(1 - y * (t * 2 - 1)), axis=1),
            "squared_hinge":
            T.mean(relu(1 - y * (t * 2 - 1))**2, axis=1),
        }[loss])

    # Add regularization
    cost += sum(abs(x).sum() for x in self.params) / self.nParams * L1reg
    cost += sum(T.sqr(x).sum() for x in self.params) / self.nParams * L2reg

    # Compute updates for network parameters
    updates = []
    lrate = T.fscalar()
    clip = T.fscalar()
    grad = T.grad(cost, self.params)
    # Clip every gradient element to the range [-clip, clip]
    grad_clipped = [T.maximum(T.minimum(g, clip), -clip) for g in grad]
    if momentum > 0:
        for w, d, g in zip(self.params, self.dparams, grad_clipped):
            updates.append(
                (w, w + momentum * momentum * d - (1 + momentum) * lrate * g))
            updates.append((d, momentum * d - lrate * g))
    else:
        for w, g in zip(self.params, grad_clipped):
            updates.append((w, w - lrate * g))

    # Create functions to be called from outside
    if loss == "ctc":
        inputs = [input, mask, label, label_time, tol, lrate, clip]
    else:
        inputs = [input, mask, label, lrate, clip]
    self.train = theano.function(
        inputs=inputs,
        outputs=cost,
        updates=updates,
    )
    self.predict = theano.function(inputs=[input, mask], outputs=output)
pca = lambda x: ((x[:, mask] - mu) / sigma).dot(V) * w + b # Predict for each recording for filename in os.listdir(INPUT_DIR): conf = {} print "Filename {}".format(filename) id, ext = os.path.splitext(filename) if ext != '.htk': continue print "Predicting for {} ...".format(id) feature = pca(readHtk(os.path.join(INPUT_DIR, filename))).astype('float32') x = feature.reshape((1, ) + feature.shape) m = numpy.ones(x.shape[:-1], dtype='int32') conf[id] = net.predict(x, m)[0] # Save predictions with smart_open(os.path.join(OUTPUT_DIR, id + '.confidence.pkl.gz'), 'wb') as f: cPickle.dump(conf, f) savemat(os.path.join(OUTPUT_DIR, id + '.confidence.mat'), conf) result_ = conf[id] # Add classes 1 and 2 (speech english and speech non english) # to create a class " Speech " result = numpy.zeros((result_.shape[0], result_.shape[1] - 1)) result[:, 0] = result_[:, 0] result[:, 1] = result_[:, 1] + result_[:, 2] result[:, 2:] = result_[:, 3:] # Output RTTM most_likely = result.argmax(axis=1) confidence = result.max(axis=1)