Example #1
    def mean_var_norm_per_file(self, h5f, mvn_h5f, vad_file=None):
        # normalize either per channel or on the whole spectrum.
        axis = 0 if self.norm_per_channel else None

        dset_name = list(h5py.File(h5f).keys())[0]
        files = h5py.File(h5f)[dset_name]['items']
        reader = h5features.Reader(h5f)
        means_vars = []
        for f in files:
            data = reader.read(from_item=f)
            items, features, times = (data.items(), data.features()[0],
                                      data.labels()[0])
            # VAD
            filtered_features = None
            if vad_file is not None:
                vad_data = read_vad_file(vad_file)
                if str(f) in vad_data:
                    filtered_features = self.filter_vad_one_file(
                        features, times, vad_data[str(f)])

            if filtered_features is None:
                mean = np.mean(features, axis=axis)
                std = np.std(features, axis=axis)
            else:
                mean = np.mean(filtered_features, axis=axis)
                std = np.std(filtered_features, axis=axis)
            features = (features - mean) / (std + np.finfo(features.dtype).eps)
            h5features.write(mvn_h5f, '/features/', items, [times], [features])
            means_vars.append((f, mean, std))
        return means_vars
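The eps term above guards against division by zero on constant (zero-variance) channels. A minimal standalone sketch of the same guard:

import numpy as np

features = np.zeros((10, 4), dtype=np.float32)  # constant, zero-variance channels
mean = np.mean(features, axis=0)
std = np.std(features, axis=0)                  # all zeros here
eps = np.finfo(features.dtype).eps
normalized = (features - mean) / (std + eps)    # stays finite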
Example #2
def run(files, output_path, config_file, save, batch_size=50):
    """Split in the file list into batches. Handle arguments.

    Parameters:
    -----------
    batch_size: int, max batch size in number of files (adjust for RAM usage)
    """
    if 'h5' in save:
        import h5features
    # with open(config_file, 'r') as fid:
    #     config = json.load(fid)
    if config_file:
        raise NotImplementedError

    batches = [files[i:i + batch_size] for i in range(0, len(files), batch_size)]
    res = {}
    for files_batch in batches:
        new_res = extract_features(files_batch, delta=0)
        if 'np' in save:
            for f in new_res:
                np.save(output_path + f, new_res[f])
        if 'h5' in save:
            # h5features.write expects (file, group, items, times, features);
            # build concrete lists (map objects are lazy in Python 3)
            items = [os.path.basename(f).split('.')[0] for f in new_res]
            times = [np.arange(d.shape[0], dtype=float) / 100 + 0.0125
                     for d in new_res.values()]
            h5features.write(output_path, 'features', items, times,
                             list(new_res.values()))
        res.update(new_res)
    return res
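The list-slicing expression above is a compact way to split a list into fixed-size chunks; a quick illustration with made-up file names:

files = ['a.wav', 'b.wav', 'c.wav', 'd.wav', 'e.wav']
batch_size = 2
batches = [files[i:i + batch_size] for i in range(0, len(files), batch_size)]
# -> [['a.wav', 'b.wav'], ['c.wav', 'd.wav'], ['e.wav']]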
Example #3
    def setup(self):
        self.file_v1 = 'v1.0.h5'
        self.file_v2 = 'v1.1.h5'
        self.teardown()  # in case files already exist, remove it

        items, times, features = generate.full(20, 10)
        h5f_1_0.write(self.file_v1, 'features', items, times, features)
        h5f_1_1.write(self.file_v2, 'features', items, times, features)
Example #5
def transcription2features(phones_file,
                           tra_file,
                           out_file,
                           word_position_dependent=True):
    """
    Kaldi 1-best aligned transcription to h5features
    format in h5features is frame by frame, as this allows
    both frame-to-frame DTW distance and edit distance to be used
    (for edit_distance the first step would be extracting the phone-level
    sequence from the frame-level sequence, discarding segments that have
    too few frames)
    This avoids problems with long phones if coding only the centerpoint
    of a phone (a long time interval within the phone, but that does not
    include the centerpoint will have empty representation). Allowing 
    representations indexed by time intervals instead of time points could 
    be more elegant when one wants to use edit_distance but this would require
    some (substantial but not huge) recoding in h5features and ABXpy.distances.
    One would need to check that the time-intervals have no overlap and are
    consecutive and one would need to adapt the features reading to provide the
    sequence of consecutive feature vectors with their durations and for the first
    and last their degree of overlap with the required time segment.
    """
    phonemap = read_kaldi_phonemap(phones_file, word_position_dependent)
    # get order used to encode the phones as integer in the features files
    phone_order = get_phone_order(phonemap)
    utt_ids = []
    times = []
    features = []
    current_utt = None
    utt_times = []
    utt_features = []
    i = 1
    for utt_id, start, stop, phone in read_kaldi_alignment(phonemap, tra_file):
        print(i)
        i = i + 1
        if current_utt is None:
            current_utt = utt_id
        if utt_id != current_utt:
            # flush the completed utterance
            utt_ids.append(current_utt)
            times.append(np.array(utt_times))
            nb_phones = len(utt_features)
            # not sure how h5features handles 1-d arrays, so reshaping
            features.append(np.array(utt_features).reshape((nb_phones, 1)))
            current_utt = utt_id
            utt_times = []
            utt_features = []
        # expanding to frame by frame using ad hoc 10ms window spacing
        # since start and stop are spaced by a multiple of 10ms due to
        # standard window spacing used by kaldi; this runs for every
        # segment, including the first one of a new utterance
        nframes = (stop - start) / 0.01
        assert np.abs(nframes - np.round(nframes)) < 1e-7  # ad hoc tolerance
        nframes = int(np.round(nframes))
        utt_features = utt_features + [phone_order.index(phone)] * nframes
        frame_times = start + 0.01 * np.arange(nframes)
        utt_times = utt_times + list(frame_times)
    # flush the last utterance (otherwise it would never be written)
    if current_utt is not None:
        utt_ids.append(current_utt)
        times.append(np.array(utt_times))
        features.append(
            np.array(utt_features).reshape((len(utt_features), 1)))
    h5features.write(out_file, 'features', utt_ids, times, features)
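As a sanity check on the frame-by-frame expansion described in the docstring, here is a minimal sketch with made-up values showing how one aligned segment maps to frame-level labels and times:

import numpy as np

# hypothetical segment: a phone aligned from 0.23 s to 0.26 s (3 frames at 10 ms)
start, stop = 0.23, 0.26
phone_index = 5  # hypothetical position of the phone in phone_order

nframes = int(round((stop - start) / 0.01))      # -> 3
labels = [phone_index] * nframes                 # -> [5, 5, 5]
frame_times = start + 0.01 * np.arange(nframes)  # -> [0.23, 0.24, 0.25]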
Example #6
def h5features_stack_fbanks(fbanks_file, stacked_fbanks_file):
    import h5features
    index = h5features.read_index(fbanks_file)
    files = index['files']
    for f in files:
        times, fbanks = h5features.read(
            fbanks_file, 'features', from_internal_file=f, index=index)
        stacked_fbanks = stack_fbanks(fbanks[f])
        h5features.write(stacked_fbanks_file, 'features', [f],
                         [times[f]], [stacked_fbanks])
Example #7
def generate_features(n_files, n_feat=2, max_frames=3, name='data.features'):
    """Random feature file generator
    """
    features = []
    times = []
    files = []
    for i in range(n_files):
        n_frames = np.random.randint(max_frames) + 1
        features.append(np.random.randn(n_frames, n_feat))
        times.append(np.linspace(0, 1, n_frames))
        files.append('s%d' % i)
    h5features.write(name, 'features', files, times, features)
Example #8
    def test_normalization_per_file(self):

        tempdir = Path(tempfile.mkdtemp())
        h5f = str(tempdir / 'h5.features')

        feature1 = np.vstack([np.full((100, 40), 1.), np.full((100, 40), -1.)])
        feature2 = np.vstack([np.full((100, 40), 1.), np.full((100, 40), 2.)])
        features = [feature1, feature2]
        items = ['file1', 'file2']
        times = [np.arange(feature1.shape[0], dtype=float) * 0.01 + 0.0025]
        times.append(np.arange(feature2.shape[0], dtype=float) * 0.01 + 0.0025)

        h5features.write(h5f, '/features/', items, times, features)

        h5f_mean_var = str(tempdir / 'h5-normalized.features')
        features_generator = FeaturesGenerator(normalization=True,
                                               norm_per_file=True)
        meansvars = features_generator.mean_var_norm_per_file(
            h5f, h5f_mean_var)

        assert meansvars[0][0] == 'file1'
        assert all(meansvars[0][1] == np.mean(feature1, axis=0))
        assert all(meansvars[0][2] == np.std(feature1, axis=0))

        assert meansvars[1][0] == 'file2'
        assert all(meansvars[1][1] == np.mean(feature2, axis=0))
        assert all(meansvars[1][2] == np.std(feature2, axis=0))

        reader = h5features.Reader(h5f_mean_var)
        data = reader.read()
        for file in data.items():
            assert np.mean(data.dict_features()[file]) == pytest.approx(0)
            assert np.std(data.dict_features()[file]) == pytest.approx(1)

        # no per-channel normalization
        tmp2 = str(tempdir / 'h5-tmp2')
        features_generator = FeaturesGenerator(normalization=True,
                                               norm_per_file=True,
                                               norm_per_channel=False)
        meansvars = features_generator.mean_var_norm_per_file(h5f, tmp2)

        assert meansvars == [
            ('file1', 0, np.std(feature1)),
            ('file2', 1.5, np.std(feature2)),
        ]

        reader = h5features.Reader(tmp2)
        data = reader.read()
        for file in data.items():
            assert np.mean(data.dict_features()[file]) == pytest.approx(0)
            assert np.std(data.dict_features()[file]) == pytest.approx(1)

        shutil.rmtree(str(tempdir))
Example #9
    def _test_wr(self, labeldim):
        """Test retrieving labels and files after a write/read operation."""
        items, t_gold, feat = generate.full(self.nbitems, tformat=labeldim)
        write(self.filename, self.group, items, t_gold, feat)
        t, _ = read(self.filename, self.group)

        assert len(t) == self.nbitems
        if not labeldim == 1:
            assert all([tt.shape[1] == labeldim for tt in t.values()])

        # build a dict from gold to compare with t
        d = dict(zip(items, t_gold))
        for dd, tt in zip(d, t):
            assert tt == dd
Example #10
def generate_features(n_files, n_feat=2, max_frames=3, name='data.features'):
    """Random feature file generator
    """
    if os.path.exists(name):
        os.remove(name)
    features = []
    times = []
    files = []
    for i in range(n_files):
        n_frames = np.random.randint(max_frames) + 1
        features.append(np.random.randn(n_frames, n_feat))
        times.append(np.linspace(0, 1, n_frames))
        files.append('s%d' % i)
    h5features.write(name, 'features', files, times, features)
Example #12
def yaafe2features(wavefiles, out_file, feature_type='MFCC'):
    """Generate features with yaafe and put them in h5features format.

    Whole wavefiles are encoded as internal h5features files.
    To use them with abkhazia's ABX tasks, these need to be segmented
    according to an abkhazia segments.txt
    (abkhazia/utilities/segment_features.py can be used for this)

    Supported feature types:
    - 'MFCC' (default)
    - 'CMSP13' (cubic-root-compressed 13-frequency-channels Mel spectrogram)
    """
    assert feature_type in ['MFCC', 'CMSP13'], \
        'Unsupported feature_type {0}'.format(feature_type)

    feature_plan = ya.FeaturePlan(sample_rate=16000)
    if feature_type == 'MFCC':
        feat_name = 'mfcc'
        feature_plan.addFeature('{0}: MFCC blockSize=400 stepSize=160'.format(
            feat_name))  # 25 ms window, 10 ms step at 16 kHz
    elif feature_type == 'CMSP13':
        feat_name = 'melsp'
        feature_plan.addFeature(
            '{0}: MelSpectrum MelNbFilters=13 blockSize=400 stepSize=160'.
            format(feat_name))  # 25 ms window, 10 ms step at 16 kHz

    engine = ya.Engine()
    engine.load(feature_plan.getDataFlow())

    wav_ids = []
    times = []
    features = []
    for wavefile in wavefiles:
        wav_ids.append(p.splitext(p.basename(wavefile))[0])
        afp = ya.AudioFileProcessor()
        afp.processFile(engine, wavefile)
        feat_out = engine.readAllOutputs()[feat_name]

        if feature_type == 'CMSP13':
            # need to add compression by hand
            feat_out = np.power(feat_out, 1 / 3.)

        # times according to:
        # http://yaafe.sourceforge.net/features.html?highlight=mfcc#yaafefeatures.Frames
        nframes = feat_out.shape[0]
        # 0.01 here is ad hoc and dependent on 160 above
        times.append(0.01 * np.arange(nframes))
        features.append(feat_out)
    h5features.write(out_file, 'features', wav_ids, times, features)
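The blockSize and stepSize values encode the usual 25 ms window and 10 ms step at a 16 kHz sample rate; a quick check of that arithmetic:

sample_rate = 16000
block_size, step_size = 400, 160

window_s = block_size / sample_rate  # 0.025 s = 25 ms analysis window
step_s = step_size / sample_rate     # 0.010 s = 10 ms step, hence the 0.01 factor in times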
Example #13
def h5features_from_nparray(input_path,
                            h5f,
                            timefunc=None,
                            rm_last_number=False,
                            transpose=False):
    """Compute speech features (such as posteriogram) that are in numpy array 
    in h5features format.

    Parameters:
    ----------
    input_path: path of the directory containing the features of audio files in numpy array
    h5f: str. Name of the h5features file to create.
    timefunc: callable. Function that returns timestamps for the aforementionned
        features. By default, it assume a window length of 25 ms and a window
        step of 10 ms.
    rm_last_number :bool, wether or not to remove the last number in each file name
    (the filenames of posteriograms have an additional number compared to the audio filenames )  
    """
    filenames = [
        f for f in listdir(input_path) if os.path.splitext(f)[-1] == ".npy"
    ]
    batch_size = 500
    features = []
    times = []
    internal_files = []
    i = 0
    for f in filenames:
        data = np.load(os.path.join(input_path, f))
        if i == batch_size:
            h5features.write(h5f, "/features/", internal_files, times,
                             features)
            features = []
            times = []
            internal_files = []
            i = 0
        i = i + 1
        features.append(data)
        if timefunc is None:
            time = np.arange(data.shape[0], dtype=float) * 0.01 + 0.0025
        else:
            time = timefunc(data)
        times.append(time)
        name = os.path.basename(os.path.splitext(f)[0])
        if rm_last_number:
            # drop the trailing number appended to posteriorgram file names
            # (assumed here to be an underscore-separated suffix)
            name = name.rsplit('_', 1)[0]
        internal_files.append(name)
    if features:
        h5features.write(h5f, "/features/", internal_files, times, features)
Example #14
    def test_normalization(self):

        tempdir = Path(tempfile.mkdtemp())
        h5f = str(tempdir / 'h5.features')

        features = [np.full((100, 40), 1.0), np.full((150, 40), 2.0)]
        items = ['file1', 'file2']
        times = [
            np.arange(features[0].shape[0], dtype=np.float32) * 0.01 + 0.0025
        ]
        times.append(
            np.arange(features[1].shape[0], dtype=np.float32) * 0.01 + 0.0025)

        h5features.write(h5f, '/features/', items, times, features)

        features_generator = FeaturesGenerator(norm_per_channel=True)
        h5f_mean_var = str(tempdir / 'h5-normalized.features')
        mean, variance = features_generator.mean_variance_normalisation(
            h5f, h5f_mean_var)

        stacked_features = np.vstack(features)
        assert mean == pytest.approx(np.mean(stacked_features, axis=0))
        assert variance == pytest.approx(np.std(stacked_features, axis=0))

        # check that the new file has 0 mean and 1 variance
        dset = list(h5py.File(h5f_mean_var).keys())[0]
        data = h5py.File(h5f_mean_var)[dset]['features'][:]
        means = np.mean(data, axis=0)
        assert np.allclose(means, 0.0, atol=1e-6)
        assert np.std(data, axis=0) == pytest.approx(1.0, abs=1e-6)

        # test normalization across all channels
        tmp2 = str(tempdir / 'h5temp.h5')
        features_generator = FeaturesGenerator(normalization=True,
                                               norm_per_channel=False)
        mean, variance = features_generator.mean_variance_normalisation(
            h5f, tmp2)
        assert mean == pytest.approx(np.mean(stacked_features))
        assert variance == pytest.approx(np.std(np.vstack(features)))

        # check that the new file has 0 mean and 1 variance
        dset = list(h5py.File(tmp2).keys())[0]
        data = h5py.File(tmp2)[dset]['features'][:]
        assert np.mean(data) == pytest.approx(0, abs=1e-6)
        assert np.std(data) == pytest.approx(1)

        shutil.rmtree(str(tempdir))
Example #15
def any_to_h5features(path,
                      files,
                      h5_filename,
                      h5_groupname,
                      batch_size=500,
                      load=np.load):
    """Append a list of npz files to a h5features file.

    File names must be relative to the directory given by the 'path'
    argument.

    Parameters
    ----------
    path : str
        Path of the directory where the numpy files are stored.
    files : list of filename
        List of file to convert and append.
    h5_filename : filename
        The output h5features file.
    h5_groupname : str
        Name of the h5 group where to store the numpy files (use
        '/features/' for h5features files).
    batch_size : int
        Size of the writing buffer (in number of npz files). By default 500.
    load : callable
        Function used to load each file (np.load by default).

    """
    features = []
    times = []
    internal_files = []
    i = 0
    for f in files:
        if i == batch_size:
            h5features.write(h5_filename, h5_groupname, internal_files, times,
                             features)
            features = []
            times = []
            internal_files = []
            i = 0
        i = i + 1
        data = load(os.path.join(path, f))
        features.append(data['features'])
        times.append(data['time'])
        internal_files.append(os.path.splitext(f)[0])
    if features:
        h5features.write(h5_filename, h5_groupname, internal_files, times,
                         features)
Example #16
def npz_to_h5features(path, files, h5_filename, h5_groupname, batch_size=500):
    """Append a list of npz files to a h5features file.

    File names must be relative to the directory given by the 'path'
    argument.

    Parameters
    ----------
    path : str
        Path of the directory where the numpy files are stored.
    files : list of filename
        List of file to convert and append.
    h5_filename : filename
        The output h5features file.
    h5_groupname : str
        Name of the h5 group where to store the numpy files (use
        '/features/' for h5features files).
    batch_size : int
        Size of the writing buffer (in number of npz files). By default 500.
    """
    features = []
    times = []
    internal_files = []
    i = 0
    for f in files:
        if i == batch_size:
            h5features.write(
                h5_filename, h5_groupname, internal_files, times, features)

            features = []
            times = []
            internal_files = []
            i = 0

        i = i+1
        data = np.load(os.path.join(path, f))
        features.append(data['features'])
        times.append(data['time'])
        internal_files.append(os.path.splitext(f)[0])

    if features:
        h5features.write(
            h5_filename, h5_groupname, internal_files, times, features)
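A minimal usage sketch for the function above; the directory layout and file names are hypothetical:

import os

# hypothetical layout: ./npz/ contains utt1.npz, utt2.npz, ..., each holding
# a 'features' array (n_frames x n_dims) and a 'time' array (n_frames,)
path = 'npz'
files = sorted(f for f in os.listdir(path) if f.endswith('.npz'))

npz_to_h5features(path, files, 'corpus.features', '/features/', batch_size=500)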
Example #17
    def h5features_compute(self, files, h5f, featfunc=None, timefunc=None):
        """Compute mfcc or filterbanks (or other) in h5features format.

        Parameters:
        ----------
        files: list, list of files on which to compute the features. You must
            give the complete relative or absolute path of the wave file
        h5f: str. Name of the h5features file to create.
        featfunc: callable. "do_fbanks" to compute fbanks, "do_mfccs" to
            compute mfccs. Or any callable function that return features
            given a wave file.
        timefunc: callable. Function that returns timestamps for the
            aforementioned features. By default, it assumes a window length
            of 25 ms and a window step of 10 ms.
        """
        if featfunc is None:
            featfunc = self.do_fbank
        batch_size = 500
        features = []
        times = []
        internal_files = []
        i = 0
        for f in files:
            if i == batch_size:
                h5features.write(h5f, '/features/', internal_files, times,
                                 features)
                features = []
                times = []
                internal_files = []
                i = 0
            i = i + 1
            data = featfunc(f)
            features.append(data)
            if timefunc is None:
                time = np.arange(data.shape[0], dtype=float) * 0.01 + 0.0025
            else:
                time = timefunc(f)
            times.append(time)
            internal_files.append(os.path.basename(os.path.splitext(f)[0]))
        if features:
            h5features.write(h5f, '/features/', internal_files, times,
                             features)
Example #18
def npz_to_h5features(path, files, h5_filename, h5_groupname, batch_size=500):
    features = []
    times = []
    internal_files = []
    i = 0
    for f in files:
        if i == batch_size:
            h5features.write(h5_filename, h5_groupname, internal_files, times,
                             features)
            features = []
            times = []
            internal_files = []
            i = 0
        i = i + 1
        data = np.load(os.path.join(path, f))
        features.append(data['features'])
        times.append(data['time'])
        internal_files.append(os.path.splitext(f)[0])
    if features:
        h5features.write(h5_filename, h5_groupname, internal_files, times,
                         features)
Example #19
def segment_features(features_file, segments_file, out_file):
    """
    Segment h5features file containing features for whole wavefiles
    of an abkhazia corpus (or split of a corpus) into features for
    segments as described in the provided segments.txt file.
    """
    utt_ids, wavefiles, starts, stops = io.read_segments(segments_file)
    if all([e is None for e in starts]) and all([e is None for e in stops]):
        # TODO use a log instead of a print statement
        print(
            "segment_features: segments already match wavefiles, "
            "doing nothing...")
    else:
        # Group utterances by wavefiles
        data = zip(utt_ids, wavefiles, starts, stops)
        for wav, utts in groupby(data, lambda e: e[1]):
            # TODO use a log instead of a print statement
            print "Segmenting features for file {} by utterance".format(wav)
            # load features for whole wavefile
            wav_id = os.path.splitext(wav)[0]
            # TODO fix that
            times, features = h5features.read(features_file,
                                              from_internal_file=wav_id)
            # no need for dict here
            times, features = times[wav_id], features[wav_id]

            utt_ids, utt_times, utt_features = [], [], []
            for utt_id, _, start, stop in utts:
                # select features for appropriate segment
                utt_ids.append(utt_id)
                indices = np.where(
                    np.logical_and(times >= start, times <= stop))[0]

                # get times relative to beginning of utterance
                utt_times.append(times[indices] - start)
                utt_features.append(features[indices, :])

            # write to out_file once for each wavefile
            h5features.write(out_file, 'features', utt_ids, utt_times,
                             utt_features)
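The segment selection above is just a boolean mask over the frame times; a tiny standalone illustration with made-up numbers:

import numpy as np

times = 0.0125 + 0.01 * np.arange(10)  # frame centers: 0.0125 .. 0.1025
start, stop = 0.03, 0.07               # hypothetical utterance boundaries

indices = np.where(np.logical_and(times >= start, times <= stop))[0]
utt_times = times[indices] - start     # times relative to utterance start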
Example #20
def h5features_compute(files, h5f, featfunc=do_fbank, timefunc=None):
    """Compute mfcc or filterbanks (or other) in h5features format.

    Parameters:
    ----------
    files: list, list of files on which to compute the features. You must
        give the complete relative or absolute path of the wave file
    h5f: str. Name of the h5features file to create.
    featfunc: callable. "do_fbanks" to compute fbanks, "do_mfccs" to compute
        mfccs. Or any callable function that return features given a wave file.
    timefunc: callable. Function that returns timestamps for the
        aforementioned features. By default, it assumes a window length
        of 25 ms and a window step of 10 ms.
    """
    batch_size = 500
    features = []
    times = []
    internal_files = []
    i = 0
    for f in files:
        if i == batch_size:
            h5features.write(h5f, '/features/', internal_files, times,
                             features)
            features = []
            times = []
            internal_files = []
            i = 0
        i = i+1
        data = featfunc(f)
        features.append(data)
        if timefunc is None:
            time = np.arange(data.shape[0], dtype=float) * 0.01 + 0.0025
        else:
            time = timefunc(f)
        times.append(time)
        internal_files.append(os.path.basename(os.path.splitext(f)[0]))
    if features:
        h5features.write(h5f, '/features/',
                         internal_files, times,
                         features)
Example #21
def features2features(in_file, out_file):
    """
    kaldi input features (mfcc, etc.) to h5features
    this loads everything into memory, but it would be easy to write
    an incremental version if this poses a problem
    Input features must be in a single archive text format, that can be
    obtained using the 'copy-feats' kaldi utility
    """
    # below is basically a parser for kaldi vector format for each line
    # parse input text file
    outside_utt = True
    features = []
    utt_ids = []
    times = []
    with codecs.open(in_file, mode='r', encoding='UTF-8') as inp:
        for index, line in enumerate(inp):
            print("Processing line {0}".format(index + 1)
                  )  # / {1}".format(index+1, len(lines)))
            tokens = line.strip().split(u" ")
            if outside_utt:
                assert len(
                    tokens) == 3 and tokens[1] == u"" and tokens[2] == u"["
                utt_id = tokens[0]
                outside_utt = False
                frames = []
            else:
                if tokens[-1] == u"]":
                    # end of utterance
                    outside_utt = True
                    tokens = tokens[:-1]
                frames.append(np.array(tokens, dtype=float))
                if outside_utt:
                    # end of utterance, continued
                    features.append(np.row_stack(frames))
                    # as in kaldi2abkhazia, this is ad hoc and has not been checked formally
                    times.append(0.0125 + 0.01 * np.arange(len(frames)))
                    utt_ids.append(utt_id)
    h5features.write(out_file, 'features', utt_ids, times, features)
Example #22
def h5features_from_nparray(input_path, h5f, timefunc=None):
    """Compute speech features (such as posteriogram) that are in numpy array 
    in h5features format.

    Parameters:
    ----------
    input_path: path of the directory containing the features of audio files in numpy array
    h5f: str. Name of the h5features file to create.
    timefunc: callable. Function that returns timestamps for the aforementionned
        features. By default, it assume a window length of 25 ms and a window
        step of 10 ms.
        
    """
    files = [f for f in listdir(input_path) if isfile(join(input_path, f))]
    batch_size = 500
    features = []
    times = []
    internal_files = []
    i = 0
    for f in files:
        if i == batch_size:
            h5features.write(h5f, '/features/', internal_files, times,
                             features)
            features = []
            times = []
            internal_files = []
            i = 0
        i = i + 1
        data = np.load(join(input_path, f))  # load the array, not the filename
        features.append(data)
        if timefunc is None:
            time = np.arange(data.shape[0], dtype=float) * 0.01 + 0.0025
        else:
            time = timefunc(f)
        times.append(time)
        internal_files.append(os.path.basename(os.path.splitext(f)[0]))
    if features:
        h5features.write(h5f, '/features/', internal_files, times, features)
Example #23
    def setup(self):
        items, self.data, feats = generate.full(10, tformat=1)
        self.filename = 'test.h5'
        self.teardown()
        write(self.filename, 'group', items, self.data, feats)
        self.group = h5py.File(self.filename, 'a')['group']
Example #25
    def test_normalization_with_VAD(self):
        # paths
        tempdir = Path(tempfile.mkdtemp())
        h5f = str(tempdir / 'h5.features')
        vad_file = str(tempdir / 'vad')

        # write VAD data for file 1
        with open(vad_file, 'w') as vad1:
            vad1.write("file,start,stop\n"
                       "file1,0.0025,0.5000\n"
                       "file1,0.7525,1.000\n")

        items = ['file1', 'file2']

        # generate data
        feature1 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
        feature2 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
        features = [feature1, feature2]
        times = [np.arange(feature1.shape[0], dtype=float) * 0.01 + 0.0025]
        times.append(np.arange(feature2.shape[0], dtype=float) * 0.01 + 0.0025)
        h5features.write(h5f, '/features/', items, times, features)

        h5f_mean_var = str(tempdir / 'h5-normalized.features')
        features_generator = FeaturesGenerator(normalization=True,
                                               norm_per_file=True,
                                               norm_per_channel=True)
        mean, var = features_generator.mean_variance_normalisation(
            h5f, h5f_mean_var, vad_file=vad_file)

        assert mean == pytest.approx(
            np.mean(np.vstack([feature1[:75], feature2]), axis=0))
        assert var == pytest.approx(
            np.std(np.vstack([feature1[:75], feature2]), axis=0))

        reader = h5features.Reader(h5f_mean_var)
        data = reader.read()
        assert data.dict_features()['file1'] == pytest.approx(
            (feature1 - mean) / var)

        assert data.dict_features()['file2'] == pytest.approx(
            (feature2 - mean) / var)

        # test without per-channel normalization
        tmp2 = str(tempdir / 'tmp2.h5')
        features_generator = FeaturesGenerator(normalization=True,
                                               norm_per_file=True,
                                               norm_per_channel=False)
        mean, var = features_generator.mean_variance_normalisation(
            h5f, tmp2, vad_file=vad_file)

        assert mean == pytest.approx(
            np.mean(np.vstack([feature1[:75], feature2])))
        assert var == pytest.approx(
            np.std(np.vstack([feature1[:75], feature2])))

        reader = h5features.Reader(tmp2)
        data = reader.read()
        assert data.dict_features()['file1'] == pytest.approx(
            (feature1 - mean) / var)

        assert data.dict_features()['file2'] == pytest.approx(
            (feature2 - mean) / var)

        shutil.rmtree(str(tempdir))
Example #26
    def test_norm_per_file_with_VAD(self):

        # paths
        tempdir = Path(tempfile.mkdtemp())
        h5f = str(tempdir / 'h5.features')
        vad_path = str(tempdir / 'vad')

        # write VAD data for file 1
        with open(str(vad_path), 'w') as vad1:
            vad1.write("file,start,stop\n"
                       "file1,0.0025,0.5000\n"
                       "file1,0.7525,1.000\n")

        items = ['file1', 'file2']

        # generate data
        feature1 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
        feature2 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
        features = [feature1, feature2]
        times = [np.arange(feature1.shape[0], dtype=float) * 0.01 + 0.0025]
        times.append(np.arange(feature2.shape[0], dtype=float) * 0.01 + 0.0025)
        h5features.write(h5f, '/features/', items, times, features)

        h5f_mean_var = str(tempdir / 'h5-normalized.features')
        features_generator = FeaturesGenerator(normalization=True,
                                               norm_per_file=True,
                                               norm_per_channel=True)
        meansvars = features_generator.mean_var_norm_per_file(
            h5f, h5f_mean_var, vad_file=str(vad_path))

        assert meansvars[0][0] == 'file1'

        assert all(meansvars[0][1] == np.mean(feature1[:75], axis=0))
        assert all(meansvars[0][2] == np.std(feature1[:75], axis=0))

        assert meansvars[1][0] == 'file2'
        assert all(meansvars[1][1] == np.mean(feature2, axis=0))
        assert all(meansvars[1][2] == np.std(feature2, axis=0))

        reader = h5features.Reader(h5f_mean_var)
        data = reader.read()
        assert data.dict_features()['file1'] == pytest.approx(
            (feature1 - np.mean(feature1[:75])) / np.std(feature1[:75]))

        assert np.mean(data.dict_features()['file2']) == pytest.approx(0)
        assert np.std(data.dict_features()['file2']) == pytest.approx(1)

        # test no per channel
        features_generator = FeaturesGenerator(
            normalization=True,
            norm_per_file=True,
            norm_per_channel=False,
        )
        tmp2 = str(tempdir / 'tmp2.h5')
        meansvars = features_generator.mean_var_norm_per_file(
            h5f, tmp2, vad_file=str(vad_path))

        assert meansvars == [
            ('file1', np.mean(feature1[:75]), np.std(feature1[:75])),
            ('file2', np.mean(feature2), np.std(feature2)),
        ]

        reader = h5features.Reader(tmp2)
        data = reader.read()
        assert data.dict_features()['file1'] == pytest.approx(
            (feature1 - np.mean(feature1[:75])) / np.std(feature1[:75]))

        assert np.mean(data.dict_features()['file2']) == pytest.approx(0)
        assert np.std(data.dict_features()['file2']) == pytest.approx(1)
        shutil.rmtree(str(tempdir))
Example #27
def lattice2features(phones_file,
                     post_file,
                     out_file,
                     out_phones_file,
                     word_position_dependent=True):
    """
    kaldi lattice posteriors to h5features
    this loads everything into memory, but it would be easy to write
    an incremental version if this poses a problem
    """
    phonemap = read_kaldi_phonemap(phones_file, word_position_dependent)
    # get order in which phones will be represented in the dimensions of the posteriorgram
    phone_order = get_phone_order(phonemap)

    d = len(phone_order)  # posteriorgram dimension
    # below is basically a parser for kaldi matrix format for each line
    # parse input text file
    with codecs.open(post_file, mode='r', encoding='UTF-8') as inp:
        # xreadlines was once supposed to be more efficient for large files
        lines = inp.readlines()
    # here would be nice to use sparse feature format (need to have it in h5features though)
    # might want to begin by using sparse numpy matrix format
    features = []
    utt_ids = []
    times = []
    for index, line in enumerate(lines):
        print("Processing line {0} / {1}".format(index + 1, len(lines)))
        tokens = line.strip().split(u" ")
        utt_id, tokens = tokens[0], tokens[1:]
        frames = []
        inside = False
        for token in tokens:
            if token == u"[":
                assert not (inside)
                inside = True
                frame = []
            elif token == u"]":
                assert inside
                inside = False
                frames.append(frame)
            else:
                assert inside
                frame.append(token)
        utt_features = np.zeros(shape=(len(frames), d), dtype=np.float64)
        for f, frame in enumerate(frames):
            assert len(frame) % 2 == 0
            probas = [float(p) for p in frame[1::2]]
            phones = [phonemap[code] for code in frame[::2]]
            # optimisation 1 would be mapping directly a given code to a given posterior dim
            for phone, proba in zip(phones, probas):
                i = phone_order.index(phone)
                # add to previous proba since different variants of a same phone will map to
                # the same dimension i of the posteriorgram
                utt_features[f, i] = utt_features[f, i] + proba
        # normalize posteriorgrams to correct for rounding or thresholding errors
        # by rescaling globally
        total_proba = np.sum(utt_features, axis=1)
        if np.max(np.abs(total_proba -
                         1)) >= 1e-5:  # ad hoc numerical tolerance...
            raise IOError(
                "In utterance {0}, frame {1}: posteriorgram does not sum to one, difference is {2}: "
                .format(utt_id, f, np.max(np.abs(total_proba - 1))))
        utt_features = utt_features / np.tile(total_proba, (d, 1)).T
        features.append(utt_features)
        utt_ids.append(utt_id)
        # as in kaldi2abkhazia, this is ad hoc and has not been checked formally
        times.append(0.0125 + 0.01 * np.arange(len(frames)))
    h5features.write(out_file, 'features', utt_ids, times, features)
    with codecs.open(out_phones_file, encoding="utf-8", mode="w") as o_ph:
        for ph in phone_order:
            o_ph.write(ph + u'\n')
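As an aside, the np.tile(...).T rescaling above can be written with plain numpy broadcasting; a small equivalence check:

import numpy as np

utt_features = np.array([[0.2, 0.3], [0.4, 0.6]])
total_proba = np.sum(utt_features, axis=1)

a = utt_features / np.tile(total_proba, (utt_features.shape[1], 1)).T
b = utt_features / total_proba[:, None]  # same result via broadcasting
assert np.allclose(a, b)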