Example #1
def readHtk(filename, chunk_size=None, preSamples=None):
    """
    Reads the features in a HTK file, and returns them in a 2-D numpy array.
    chunk_size: integer specifying number of samples per chunk.
    preSamples: integer specifying the number of samples to prepend to
                a chunk to mitigate issues at chunk boundaries. If
                chunk_size is not None, preSamples is assumed to be
                not None as well.
    """
    # Only do chunking if chunk_size is passed to the function.
    if chunk_size is not None:
        assert chunk_size > 0, "chunk_size needs to be > 0"
        with smart_open(filename, "rb") as f:
            nSamples, sampPeriod, sampSize, parmKind = struct.unpack(
                ">iihh", f.read(12))
            assert nSamples > 0, "nSamples needs to be > 0"
            assert sampSize > 0, "sampSize needs to be > 0"

            # If the size of the features is less than the chunk size.
            if nSamples < chunk_size:
                chunk_size = nSamples

            # Iterate over all full chunks first.
            for i in range(nSamples // chunk_size):
                # We want to add a few samples to the beginning of each chunk,
                # but only after the first one.
                if i == 0:
                    readSize = chunk_size * sampSize
                    dataSize = readSize // 4
                    outputSize = chunk_size
                else:
                    readSize = (chunk_size + preSamples) * sampSize
                    dataSize = readSize // 4
                    outputSize = chunk_size + preSamples

                data = struct.unpack(">%df" % (dataSize), f.read(readSize))
                yield numpy.array(data).reshape(outputSize, sampSize // 4)

                # Move the file cursor back so that the next chunk reuses
                # some of the same samples.
                f.seek(-(preSamples * sampSize), 1)

            # Handle whatever remains after the last full chunk, plus the
            # preSamples the cursor was rewound over after that chunk.
            chunk_size = nSamples - (chunk_size *
                                     (nSamples // chunk_size)) + preSamples
            if chunk_size > preSamples:
                data = struct.unpack(">%df" % (chunk_size * sampSize // 4),
                                     f.read(chunk_size * sampSize))
                yield numpy.array(data).reshape(chunk_size, sampSize // 4)
    else:
        with smart_open(filename, "rb") as f:
            # Read header
            nSamples, sampPeriod, sampSize, parmKind = struct.unpack(
                ">iihh", f.read(12))
            # Read data
            data = struct.unpack(">%df" % (nSamples * sampSize // 4),
                                 f.read(nSamples * sampSize))
            yield numpy.array(data).reshape(nSamples, sampSize // 4)
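
A minimal usage sketch (the file name and sizes are hypothetical): this version of readHtk is a generator, so the chunks are consumed in a loop; per the code above, the first chunk has chunk_size rows and every later one chunk_size + preSamples.

for chunk in readHtk("features.htk", chunk_size=1000, preSamples=10):
    print(chunk.shape)  # (1000, dim) for the first chunk, (1010, dim) after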
Example #2
def readFmatrix(filename):
    """
    Reads a float matrix from a Janus feature file.
    """
    with smart_open(filename, "rb") as f:
        _, rows, cols, _ = struct.unpack(">4i", f.read(16))
        return numpy.array(struct.unpack(">%df" % (rows * cols), f.read())).reshape(rows, cols)
Example #3
def writeFmatrix(filename, matrix):
    """
    Writes a float matrix to a Janus feature file.
    """
    with smart_open(filename, "wb") as f:
        f.write("FMAT")
        f.write(struct.pack(">3i", matrix.shape[0], matrix.shape[1], 0))
        f.write(struct.pack(">%df" % matrix.size, *matrix.ravel()))
Example #4
def readFmatrix(filename):
    """
    Reads a float matrix from a Janus feature file.
    """
    with smart_open(filename, "rb") as f:
        _, rows, cols, _ = struct.unpack(">4i", f.read(16))
        return numpy.array(struct.unpack(">%df" % (rows * cols),
                                         f.read())).reshape(rows, cols)
Example #5
def writeScp(filename, uttids, pointers):
    """
    Takes a list of utterance IDs and a list of strings in the format "filename:offset",
      and writes them to a Kaldi script file.
    """
    with smart_open(filename, "w") as f:
        for uttid, pointer in zip(uttids, pointers):
            f.write("%s %s\n" % (uttid, pointer))
Example #6
def writeFmatrix(filename, matrix):
    """
    Writes a float matrix to a Janus feature file.
    """
    with smart_open(filename, "wb") as f:
        f.write("FMAT")
        f.write(struct.pack(">3i", matrix.shape[0], matrix.shape[1], 0))
        f.write(struct.pack(">%df" % matrix.size, *matrix.ravel()))
Example #7
def readScp(filename, limit=numpy.inf):
    """
    Reads the features in a Kaldi script file.
    Returns a list of feature matrices and a list of the utterance IDs.
    """
    features = []; uttids = []
    with smart_open(filename, "r") as f:
        for line in f:
            uttid, pointer = line.strip().split()
            p = pointer.rfind(":")
            arkfile, offset = pointer[:p], int(pointer[p+1:])
            with smart_open(arkfile, "rb") as g:
                g.seek(offset)
                feature = readMatrix(g)
            features.append(feature)
            uttids.append(uttid)
            if len(features) == limit: break
    return features, uttids
Example #8
def readPfile(filename):
    """
    Reads the contents of a pfile. Returns a tuple (features, labels), where
    both elements are lists of 2-D numpy arrays. Each element of a list
    corresponds to a sentence; each row of a 2-D array corresponds to a frame.
    In the case where the pfile doesn't contain labels, "labels" will be None.
    """

    with smart_open(filename, "rb") as f:
        # Read header
        # Assuming all data are consistent
        for line in f:
            tokens = line.decode("utf-8").split()  # header lines are text
            if tokens[0] == "-pfile_header":
                headerSize = int(tokens[4])
            elif tokens[0] == "-num_sentences":
                nSentences = int(tokens[1])
            elif tokens[0] == "-num_frames":
                nFrames = int(tokens[1])
            elif tokens[0] == "-first_feature_column":
                cFeature = int(tokens[1])
            elif tokens[0] == "-num_features":
                nFeatures = int(tokens[1])
            elif tokens[0] == "-first_label_column":
                cLabel = int(tokens[1])
            elif tokens[0] == "-num_labels":
                nLabels = int(tokens[1])
            elif tokens[0] == "-format":
                format = tokens[1].replace("d", "i")
            elif tokens[0] == "-end":
                break
        nCols = len(format)
        dataSize = nFrames * nCols

        # Read sentence index
        f.seek(headerSize + dataSize * 4)
        index = struct.unpack(">%di" % (nSentences + 1),
                              f.read(4 * (nSentences + 1)))

        # Read data
        f.seek(headerSize)
        features = []
        labels = []
        sen = 0
        for i in range(nFrames):
            if i == index[sen]:
                features.append([])
                labels.append([])
                sen += 1
            data = struct.unpack(">" + format, f.read(4 * nCols))
            features[-1].append(data[cFeature:cFeature + nFeatures])
            labels[-1].append(data[cLabel:cLabel + nLabels])
        features = [numpy.array(x) for x in features]
        labels = [numpy.array(x) for x in labels] if nLabels > 0 else None

    return (features, labels)
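
To make the offset arithmetic concrete: each frame is a fixed-width record of nCols big-endian 4-byte fields, so the sentence index sits immediately after the data block. An illustrative calculation (the feature and label counts are made up):

# Per-frame record: [sentence_id, frame_id, features..., labels...],
# each field 4 bytes. With, say, 40 features and 1 label:
nCols = 2 + 40 + 1                # 43 fields, i.e. 172 bytes per frame
# The sentence index (nSentences + 1 int32s) starts at
#   headerSize + nFrames * nCols * 4
# which is exactly where the f.seek() above jumps to.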
Example #9
def readPfile(filename):
    """
    Reads the contents of a pfile. Returns a tuple (feature, label), where
    both elements are lists of 2-D numpy arrays. Each element of a list
    corresponds to a sentence; each row of a 2-D array corresponds to a frame.
    In the case where the pfile doesn't contain labels, "label" will be None.
    """

    with smart_open(filename, "rb") as f:
        # Read header
        # Assuming all data are consistent
        for line in f:
            tokens = line.decode("utf-8").split()  # header lines are text
            if tokens[0] == "-pfile_header":
                headerSize = int(tokens[4])
            elif tokens[0] == "-num_sentences":
                nSentences = int(tokens[1])
            elif tokens[0] == "-num_frames":
                nFrames = int(tokens[1])
            elif tokens[0] == "-first_feature_column":
                cFeature = int(tokens[1])
            elif tokens[0] == "-num_features":
                nFeatures = int(tokens[1])
            elif tokens[0] == "-first_label_column":
                cLabel = int(tokens[1])
            elif tokens[0] == "-num_labels":
                nLabels = int(tokens[1])
            elif tokens[0] == "-format":
                format = tokens[1].replace("d", "i")
            elif tokens[0] == "-end":
                break
        nCols = len(format)
        dataSize = nFrames * nCols

        # Read sentence index
        f.seek(headerSize + dataSize * 4)
        index = struct.unpack(">%di" % (nSentences + 1), f.read(4 * (nSentences + 1)))

        # Read data
        f.seek(headerSize)
        feature = []
        label = []
        sen = 0
        for i in range(nFrames):
            if i == index[sen]:
                feature.append([])
                label.append([])
                sen += 1
            data = struct.unpack(">" + format, f.read(4 * nCols))
            feature[-1].append(data[cFeature : cFeature + nFeatures])
            label[-1].append(data[cLabel : cLabel + nLabels])
        feature = [numpy.array(x) for x in feature]
        label = [numpy.array(x) for x in label] if nLabels > 0 else None

    return (feature, label)
Example #10
def writeHtk(filename, feature, sampPeriod, parmKind):
    """
    Writes the features in a 2-D numpy array into a HTK file.
    """
    with smart_open(filename, "wb") as f:
        # Write header
        nSamples = feature.shape[0]
        sampSize = feature.shape[1] * 4
        f.write(struct.pack(">iihh", nSamples, sampPeriod, sampSize, parmKind))

        # Write data
        f.write(struct.pack(">%df" % (nSamples * sampSize / 4), *feature.ravel()))
Example #11
def writePfile(filename, features, labels=None):
    """
    Writes "features" and "labels" to a pfile. Both "features" and "labels"
    should be lists of 2-D numpy arrays. Each element of a list corresponds
    to a sentence; each row of a 2-D array corresponds to a frame. In the case
    where there is only one label per frame, the elements of the "labels" list
    can be 1-D arrays.
    """

    nSentences = len(features)
    nFrames = sum(len(x) for x in features)
    nFeatures = len(numpy.array(features[0][0]).ravel())
    nLabels = len(numpy.array(
        labels[0][0]).ravel()) if labels is not None else 0
    nCols = 2 + nFeatures + nLabels
    headerSize = 32768
    dataSize = nFrames * nCols

    with smart_open(filename, "wb") as f:
        # Write header
        writeBytes(f, "-pfile_header version 0 size %d\n" % headerSize)
        writeBytes(f, "-num_sentences %d\n" % nSentences)
        writeBytes(f, "-num_frames %d\n" % nFrames)
        writeBytes(f, "-first_feature_column 2\n")
        writeBytes(f, "-num_features %d\n" % nFeatures)
        writeBytes(f, "-first_label_column %d\n" % (2 + nFeatures))
        writeBytes(f, "-num_labels %d\n" % nLabels)
        writeBytes(f, "-format dd" + "f" * nFeatures + "d" * nLabels + "\n")
        writeBytes(
            f, "-data size %d offset 0 ndim 2 nrow %d ncol %d\n" %
            (dataSize, nFrames, nCols))
        writeBytes(
            f, "-sent_table_data size %d offset %d ndim 1\n" %
            (nSentences + 1, dataSize))
        writeBytes(f, "-end\n")

        # Write data
        f.seek(headerSize)
        for i in range(nSentences):
            for j in range(len(features[i])):
                f.write(struct.pack(">2i", i, j))
                f.write(
                    struct.pack(">%df" % nFeatures,
                                *numpy.array(features[i][j]).ravel()))
                if labels is not None:
                    f.write(
                        struct.pack(
                            ">%di" % nLabels,
                            *numpy.array(labels[i][j].astype(int)).ravel()))

        # Write sentence index
        index = numpy.cumsum([0] + [len(x) for x in features])
        f.write(struct.pack(">%di" % (nSentences + 1), *index))
Example #12
def readHtk(filename):
    """
    Reads the features in a HTK file, and returns them in a 2-D numpy array.
    """

    with smart_open(filename, "rb") as f:
        # Read header
        nSamples, sampPeriod, sampSize, parmKind = struct.unpack(">iihh", f.read(12))
        # sampPeriod and parmKind are read but not used

        # Read data
        data = struct.unpack(">%df" % (nSamples * sampSize // 4), f.read(nSamples * sampSize))
        return numpy.array(data).reshape(nSamples, sampSize // 4)
Example #13
def writeHtk(filename, feature, sampPeriod, parmKind):
    """
    Writes the features in a 2-D numpy array into a HTK file.
    """
    with smart_open(filename, "wb") as f:
        # Write header
        nSamples = feature.shape[0]
        sampSize = feature.shape[1] * 4
        f.write(struct.pack(">iihh", nSamples, sampPeriod, sampSize, parmKind))

        # Write data
        f.write(
            struct.pack(">%df" % (nSamples * sampSize // 4), *feature.ravel()))
Example #14
def writeArk(filename, features, uttids):
    """
    Takes a list of feature matrices and a list of utterance IDs,
      and writes them to a Kaldi ark file.
    Returns a list of strings in the format "filename:offset",
      which can be used to write a Kaldi script file.
    """
    pointers = []
    with smart_open(filename, "wb") as f:
        for feature, uttid in zip(features, uttids):
            writeString(f, uttid)
            pointers.append("%s:%d" % (filename, f.tell()))
            writeMatrix(f, feature)
    return pointers
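
Note that writeString is called before f.tell(), so each pointer skips past the utterance ID and lands on the matrix itself, which is exactly where readScp (Example #7) seeks. A hedged end-to-end sketch with hypothetical paths:

feats = [numpy.random.rand(5, 13).astype("float32") for _ in range(2)]
ids = ["utt1", "utt2"]
pointers = writeArk("feats.ark", feats, ids)
writeScp("feats.scp", ids, pointers)
feats2, ids2 = readScp("feats.scp")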
Example #15
def readHtk(filename):
    """
    Reads the features in a HTK file, and returns them in a 2-D numpy array.
    """

    with smart_open(filename, "rb") as f:
        # Read header
        nSamples, sampPeriod, sampSize, parmKind = struct.unpack(
            ">iihh", f.read(12))
        # sampPeriod and parmKind are read but not used

        # Read data
        data = struct.unpack(">%df" % (nSamples * sampSize // 4),
                             f.read(nSamples * sampSize))
        return numpy.array(data).reshape(nSamples, sampSize // 4)
Example #16
def readArk(filename, limit=numpy.inf):
    """
    Reads the features in a Kaldi ark file.
    Returns a list of feature matrices and a list of the utterance IDs.
    """
    features = []; uttids = []
    with smart_open(filename, "rb") as f:
        while True:
            try:
                uttid = readString(f)
            except ValueError:
                break
            feature = readMatrix(f)
            features.append(feature)
            uttids.append(uttid)
            if len(features) == limit: break
    return features, uttids
Example #17
def writePfile(filename, feature, label=None):
    """
    Writes "feature" and "label" to a pfile. Both inputs "feature" and "label"
    should be lists of 2-D numpy arrays. Each element of a list corresponds
    to a sentence; each row of a 2-D array corresponds to a frame. In the case
    where there is only one label per frame, the elements of the "label" list
    can be 1-D arrays.
    """

    nSentences = len(feature)
    nFrames = sum(len(x) for x in feature)
    nFeatures = len(numpy.array(feature[0][0]).ravel())
    nLabels = len(numpy.array(label[0][0]).ravel()) if label is not None else 0
    nCols = 2 + nFeatures + nLabels
    headerSize = 32768
    dataSize = nFrames * nCols

    with smart_open(filename, "wb") as f:
        # Write header
        f.write("-pfile_header version 0 size %d\n" % headerSize)
        f.write("-num_sentences %d\n" % nSentences)
        f.write("-num_frames %d\n" % nFrames)
        f.write("-first_feature_column 2\n")
        f.write("-num_features %d\n" % nFeatures)
        f.write("-first_label_column %d\n" % (2 + nFeatures))
        f.write("-num_labels %d\n" % nLabels)
        f.write("-format dd" + "f" * nFeatures + "d" * nLabels + "\n")
        f.write("-data size %d offset 0 ndim 2 nrow %d ncol %d\n" % (dataSize, nFrames, nCols))
        f.write("-sent_table_data size %d offset %d ndim 1\n" % (nSentences + 1, dataSize))
        f.write("-end\n")

        # Write data
        f.seek(headerSize)
        for i in range(nSentences):
            for j in range(len(feature[i])):
                f.write(struct.pack(">2i", i, j))
                f.write(struct.pack(">%df" % nFeatures, *numpy.array(feature[i][j]).ravel()))
                if label is not None:
                    f.write(struct.pack(">%di" % nLabels, *numpy.array(label[i][j]).astype(int).ravel()))

        # Write sentence index
        index = numpy.cumsum([0] + [len(x) for x in feature])
        f.write(struct.pack(">%di" % (nSentences + 1), *index))
Example #18
def writeAudioSet(filename, wav, labels):
    """
    Writes audio and labels to a Google Audio Set file (disguised as .flac).
    Takes two variables as input:
      * wav -- a 2-D numpy array, where each row is a waveform
        (10 seconds @ 16 kHz, mono, dtype is arbitrary);
      * labels -- a 2-D numpy array of zeros and ones, where each row
        indicates the sound events active in the corresponding waveform.
    The number of rows in the two variables must match.
    The audio is concatenated and compressed in the FLAC format.
    The labels are appended to the FLAC audio file.
    This function relies on ffmpeg.
    """

    # Validate input
    if len(wav) != len(labels):
        raise ValueError(
            "The number of rows in 'wav' and 'labels' must match.")

    # Convert wav to int16, ensuring the correct range
    nClips, nSamples = wav.shape
    if numpy.abs(wav).max() <= 1: wav *= 32768
    wav = numpy.maximum(numpy.minimum(wav, 32767), -32768).astype("int16")

    # Convert labels to bit arrays
    nLabels = labels.shape[1]
    labels = labels.astype("uint8")
    nBytes = (nLabels - 1) // 8 + 1
    bytes = numpy.zeros((nClips, nBytes), dtype="uint8")
    for i in range(nLabels):
        bytes[:, i // 8] += labels[:, i] << (i % 8)

    # Write file
    wavfile.write(filename + ".wav", 16000, wav.ravel())
    subprocess.check_output("ffmpeg -i %s.wav -c:a flac -y %s && rm %s.wav" %
                            (filename, filename, filename),
                            shell=True)
    with smart_open(filename, "ab") as f:
        f.write(struct.pack("<%dB" % bytes.size, *bytes.ravel()))
        f.write(struct.pack("<3i", nClips, nSamples, nLabels))
Example #19
def readAudioSet(filename):
    """
    Reads audio and labels from a Google Audio Set file (disguised as .flac).
    Returns two variables:
      * wav -- a 2-D numpy float32 array, where each row is a waveform
        (10 seconds @ 16 kHz, mono);
      * labels -- a 2-D numpy int32 array of zeros and ones, where each row
        indicates the sound events active in the corresponding waveform.
    """
    wav, _ = librosa.core.load(filename, sr=16000, dtype="float32")
    with smart_open(filename, "rb") as f:
        f.seek(-12, 2)
        nClips, nSamples, nLabels = struct.unpack("<3i", f.read(12))
        wav = wav.reshape(nClips, nSamples)
        nBytes = (nLabels - 1) // 8 + 1
        f.seek(-12 - nClips * nBytes, 2)
        data = struct.unpack("<%dB" % (nClips * nBytes),
                             f.read(nClips * nBytes))
        bytes = numpy.array(data).reshape(nClips, nBytes)
        labels = numpy.zeros((nClips, nLabels), dtype="int32")
        for i in range(nLabels):
            labels[:, i] = (bytes[:, i // 8] >> (i % 8)) & 1
        return wav, labels
Example #20
    def save(self, filename):
        with smart_open(filename, "wb") as f:
            dill.dump(self.getParams(), f)
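
The matching load path appears in Example #21's constructor (the filename branch); as a standalone sketch, assuming getParams() returned a plain dict:

def load(filename):
    with smart_open(filename, "rb") as f:
        return dill.load(f)  # the dict that save() wrote above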
Example #21
    def __init__(
            self,
            Nlayers=1,  # number of layers
            Ndirs=1,  # unidirectional or bidirectional
            Nx=100,  # input size
            Nh=100,  # hidden layer size
            Ny=100,  # output size
            Ah="relu",  # hidden unit activation (e.g. relu, tanh, lstm)
            Ay="linear",  # output unit activation (e.g. linear, sigmoid, softmax)
            predictPer="frame",  # frame or sequence
            loss=None,  # loss function (e.g. mse, ce, ce_group, hinge, squared_hinge)
            L1reg=0.0,  # L1 regularization
            L2reg=0.0,  # L2 regularization
            dropout=0.0,  # dropout
            momentum=0.0,  # SGD momentum
            seed=15213,  # random seed for initializing the weights
            frontEnd=None,  # a lambda function for transforming the input
            filename=None,  # initialize from file
            initParams=None,  # initialize from given dict
    ):

        if filename is not None:  # load parameters from file
            with smart_open(filename, "rb") as f:
                initParams = dill.load(f)
        if initParams is not None:  # load parameters from given dict
            self.paramNames = []
            self.params = []
            for k, v in initParams.items():
                if type(v) is numpy.ndarray:
                    self.addParam(k, v)
                else:
                    setattr(self, k, v)
                    self.paramNames.append(k)
            # Note: locals()[k] = v doesn't work, so rebind the names explicitly
            Nlayers, Ndirs, Nx, Nh, Ny, Ah, Ay, predictPer, loss, L1reg, L2reg, dropout, momentum, frontEnd \
                = self.Nlayers, self.Ndirs, self.Nx, self.Nh, self.Ny, self.Ah, self.Ay, self.predictPer, self.loss, self.L1reg, self.L2reg, self.dropout, self.momentum, self.frontEnd
        else:  # Initialize parameters randomly
            # Names of parameters to save to file
            self.paramNames = [
                "Nlayers", "Ndirs", "Nx", "Nh", "Ny", "Ah", "Ay", "predictPer",
                "loss", "L1reg", "L2reg", "dropout", "momentum", "frontEnd"
            ]
            for name in self.paramNames:
                value = locals()[name]
                setattr(self, name, value)

            # Values of parameters for building the computational graph
            self.params = []

            # Initialize random number generators
            global rng
            rng = numpy.random.RandomState(seed)
            theano_rng = RandomStreams(seed)

            # Construct parameter matrices
            Nlstm = 4 if Ah == 'lstm' else 1
            self.addParam("Win", rand_init((Nx, Nh * Ndirs * Nlstm), Ah))
            self.addParam("Wrec",
                          rand_init((Nlayers, Ndirs, Nh, Nh * Nlstm), Ah))
            self.addParam(
                "Wup",
                rand_init((Nlayers - 1, Nh * Ndirs, Nh * Ndirs * Nlstm), Ah))
            self.addParam("Wout", rand_init((Nh * Ndirs, Ny), Ay))
            if Ah != "lstm":
                self.addParam("Bhid", zeros((Nlayers, Nh * Ndirs)))
            else:
                self.addParam(
                    "Bhid",
                    numpy.tile(
                        numpy.hstack([
                            full((Nlayers, Nh), 1.0),
                            zeros((Nlayers, Nh * 3))
                        ]), (1, Ndirs)))
            self.addParam("Bout", zeros(Ny))
            self.addParam("h0", zeros((Nlayers, Ndirs, Nh)))
            if Ah == "lstm":
                self.addParam("c0", zeros((Nlayers, Ndirs, Nh)))

        # Compute total number of parameters
        self.nParams = sum(x.get_value().size for x in self.params)

        # Initialize gradient tensors when using momentum
        if momentum > 0:
            self.dparams = [
                theano.shared(zeros(x.get_value().shape)) for x in self.params
            ]

        # Build computation graph
        input = T.ftensor3()
        mask = T.imatrix()
        mask_int = [(mask % 2).nonzero(), (mask >= 2).nonzero()]
        mask_float = [
            T.cast((mask % 2).dimshuffle((1, 0)).reshape(
                (mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
            T.cast((mask >= 2).dimshuffle((1, 0)).reshape(
                (mask.shape[1], mask.shape[0], 1)), theano.config.floatX)
        ]

        # mask_int = [(mask & 1).nonzero(), (mask & 2).nonzero()]
        # mask_float = [T.cast((mask & 1).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
        #               T.cast(((mask & 2) / 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)]

        def step_rnn(x_t, mask, h_tm1, W, h0):
            h_tm1 = T.switch(mask, h0, h_tm1)
            return [ACTIVATION[Ah](x_t + h_tm1.dot(W))]

        def step_lstm(x_t, mask, c_tm1, h_tm1, W, c0, h0):
            c_tm1 = T.switch(mask, c0, c_tm1)
            h_tm1 = T.switch(mask, h0, h_tm1)
            a = x_t + h_tm1.dot(W)
            f_t = T.nnet.sigmoid(a[:, :Nh])
            i_t = T.nnet.sigmoid(a[:, Nh:Nh * 2])
            o_t = T.nnet.sigmoid(a[:, Nh * 2:Nh * 3])
            c_t = T.tanh(a[:, Nh * 3:]) * i_t + c_tm1 * f_t
            h_t = T.tanh(c_t) * o_t
            return [c_t, h_t]

        x = input if frontEnd is None else frontEnd(input)

        def forward_pass(x, dropout):
            if dropout != 0.0:
                x *= theano_rng.binomial(
                    n=1,
                    p=1 - dropout,
                    size=x.shape,
                    dtype=theano.config.floatX) / (1 - dropout)
            for i in range(Nlayers):
                h = (x.dimshuffle((1, 0, 2)).dot(self.Win)
                     if i == 0 else h.dot(self.Wup[i - 1])) + self.Bhid[i]
                rep = lambda x: T.extra_ops.repeat(
                    x.reshape((1, -1)), h.shape[1], axis=0)
                if Ah != "lstm":
                    h = T.concatenate([
                        theano.scan(
                            fn=step_rnn,
                            sequences=[
                                h[:, :, Nh * d:Nh * (d + 1)], mask_float[d]
                            ],
                            outputs_info=[rep(self.h0[i, d])],
                            non_sequences=[
                                self.Wrec[i, d],
                                rep(self.h0[i, d])
                            ],
                            go_backwards=(d == 1),
                        )[0][::(1 if d == 0 else -1)] for d in range(Ndirs)
                    ],
                                      axis=2)
                else:
                    h = T.concatenate([
                        theano.scan(
                            fn=step_lstm,
                            sequences=[
                                h[:, :, Nh * 4 * d:Nh * 4 *
                                  (d + 1)], mask_float[d]
                            ],
                            outputs_info=[
                                rep(self.c0[i, d]),
                                rep(self.h0[i, d])
                            ],
                            non_sequences=[
                                self.Wrec[i, d],
                                rep(self.c0[i, d]),
                                rep(self.h0[i, d])
                            ],
                            go_backwards=(d == 1),
                        )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs)
                    ],
                                      axis=2)
                if dropout != 0.0:
                    h *= theano_rng.binomial(
                        n=1,
                        p=1 - dropout,
                        size=h.shape,
                        dtype=theano.config.floatX) / (1 - dropout)
            h = h.dimshuffle((1, 0, 2))
            if predictPer == "sequence":
                h = T.concatenate([
                    h[mask_int[1 - d]][:, Nh * d:Nh * (d + 1)]
                    for d in range(Ndirs)
                ],
                                  axis=1)
            return ACTIVATION[Ay](h.dot(self.Wout) + self.Bout)

        output = forward_pass(x, 0.0)
        output_dropout = output if dropout == 0.0 else forward_pass(x, dropout)

        # Compute loss function
        if loss is None:
            loss = {
                "linear": "mse",
                "sigmoid": "ce",
                "softmax": "ce_group"
            }[self.Ay]
        if loss == "ctc":
            label = T.imatrix()
            label_time = T.imatrix()
            tol = T.iscalar()
            cost = ctc_cost(output_dropout, mask, label, label_time, tol)
        else:
            if predictPer == "sequence":
                label = T.fmatrix()
                y = output_dropout
                t = label
            elif predictPer == "frame":
                label = T.ftensor3()
                indices = (mask >= 0).nonzero()
                y = output_dropout[indices]
                t = label[indices]
            cost = T.mean({
                "ce":
                -T.mean(T.log(y) * t + T.log(1 - y) * (1 - t), axis=1),
                "ce_group":
                -T.log((y * t).sum(axis=1)),
                "mse":
                T.mean((y - t)**2, axis=1),
                "hinge":
                T.mean(relu(1 - y * (t * 2 - 1)), axis=1),
                "squared_hinge":
                T.mean(relu(1 - y * (t * 2 - 1))**2, axis=1),
            }[loss])

        # Add regularization
        cost += sum(abs(x).sum() for x in self.params) / self.nParams * L1reg
        cost += sum(T.sqr(x).sum() for x in self.params) / self.nParams * L2reg

        # Compute updates for network parameters
        updates = []
        lrate = T.fscalar()
        clip = T.fscalar()
        grad = T.grad(cost, self.params)
        grad_clipped = [T.maximum(T.minimum(g, clip), -clip) for g in grad]
        if momentum > 0:
            for w, d, g in zip(self.params, self.dparams, grad_clipped):
                updates.append(
                    (w,
                     w + momentum * momentum * d - (1 + momentum) * lrate * g))
                updates.append((d, momentum * d - lrate * g))
        else:
            for w, g in zip(self.params, grad_clipped):
                updates.append((w, w - lrate * g))

        # Create functions to be called from outside
        if loss == "ctc":
            inputs = [input, mask, label, label_time, tol, lrate, clip]
        else:
            inputs = [input, mask, label, lrate, clip]
        self.train = theano.function(
            inputs=inputs,
            outputs=cost,
            updates=updates,
        )

        self.predict = theano.function(inputs=[input, mask], outputs=output)
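
A minimal construction sketch; the class name RNN is hypothetical (only __init__ is shown here) and the hyperparameters are illustrative:

net = RNN(Nlayers=2, Ndirs=2, Nx=40, Nh=128, Ny=10,
          Ah="lstm", Ay="softmax", predictPer="frame",
          dropout=0.2, momentum=0.9)
# net.train(...) and net.predict(input, mask) are then available
# as the compiled Theano functions defined at the end of __init__.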
Example #22
pca = lambda x: ((x[:, mask] - mu) / sigma).dot(V) * w + b

# Predict for each recording
for filename in os.listdir(INPUT_DIR):
    conf = {}
    print "Filename {}".format(filename)
    id, ext = os.path.splitext(filename)
    if ext != '.htk': continue
    print "Predicting for {} ...".format(id)
    feature = pca(readHtk(os.path.join(INPUT_DIR, filename))).astype('float32')
    x = feature.reshape((1, ) + feature.shape)
    m = numpy.ones(x.shape[:-1], dtype='int32')
    conf[id] = net.predict(x, m)[0]

    # Save predictions
    with smart_open(os.path.join(OUTPUT_DIR, id + '.confidence.pkl.gz'),
                    'wb') as f:
        cPickle.dump(conf, f)
        savemat(os.path.join(OUTPUT_DIR, id + '.confidence.mat'), conf)

    result_ = conf[id]

    # Merge classes 1 and 2 (English speech and non-English speech)
    # into a single "Speech" class.
    result = numpy.zeros((result_.shape[0], result_.shape[1] - 1))
    result[:, 0] = result_[:, 0]
    result[:, 1] = result_[:, 1] + result_[:, 2]
    result[:, 2:] = result_[:, 3:]
    # Output RTTM
    most_likely = result.argmax(axis=1)
    confidence = result.max(axis=1)
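    # A hedged sketch of the RTTM writing that would follow: merge runs of
    # identical argmax frames into segments. It assumes 10 ms frames and a
    # hypothetical CLASS_NAMES list mapping class indices to label strings.
    FRAME_SEC = 0.01
    with open(os.path.join(OUTPUT_DIR, id + '.rttm'), 'w') as g:
        start = 0
        for t in range(1, len(most_likely) + 1):
            if t == len(most_likely) or most_likely[t] != most_likely[start]:
                g.write("SPEAKER %s 1 %.2f %.2f <NA> <NA> %s <NA> <NA>\n" %
                        (id, start * FRAME_SEC, (t - start) * FRAME_SEC,
                         CLASS_NAMES[most_likely[start]]))
                start = t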