示例#1
0
    def test_normalization_per_file(self):
        """Per-file mean/variance normalization of a 2-file h5features archive.

        Checks both the per-channel (default) and the global
        (norm_per_channel=False) statistics returned by
        mean_var_norm_per_file, and that the written features are
        zero-mean / unit-std.
        """
        tempdir = Path(tempfile.mkdtemp())
        h5f = str(tempdir / 'h5.features')

        # file1 averages to 0 and file2 to 1.5 (200 frames x 40 channels each)
        feature1 = np.vstack([np.full((100, 40), 1.), np.full((100, 40), -1.)])
        feature2 = np.vstack([np.full((100, 40), 1.), np.full((100, 40), 2.)])
        features = [feature1, feature2]
        items = ['file1', 'file2']
        # frame timestamps: 10 ms steps, centered at 2.5 ms
        times = [np.arange(feature1.shape[0], dtype=float) * 0.01 + 0.0025]
        times.append(np.arange(feature2.shape[0], dtype=float) * 0.01 + 0.0025)

        h5features.write(h5f, '/features/', items, times, features)

        # per-channel normalization (the default)
        h5f_mean_var = str(tempdir / 'h5-normalized.features')
        features_generator = FeaturesGenerator(normalization=True,
                                               norm_per_file=True)
        meansvars = features_generator.mean_var_norm_per_file(
            h5f, h5f_mean_var)

        # returned statistics are the per-channel mean/std of each file
        assert meansvars[0][0] == 'file1'
        assert all(meansvars[0][1] == np.mean(feature1, axis=0))
        assert all(meansvars[0][2] == np.std(feature1, axis=0))

        assert meansvars[1][0] == 'file2'
        assert all(meansvars[1][1] == np.mean(feature2, axis=0))
        assert all(meansvars[1][2] == np.std(feature2, axis=0))

        # normalized features must be zero-mean / unit-std per file
        reader = h5features.Reader(h5f_mean_var)
        data = reader.read()
        for file in data.items():
            assert np.mean(data.dict_features()[file]) == pytest.approx(0)
            assert np.std(data.dict_features()[file]) == pytest.approx(1)

        # no per channel: statistics are scalars over the whole spectrum
        tmp2 = str(tempdir / 'h5-tmp2')
        features_generator = FeaturesGenerator(normalization=True,
                                               norm_per_file=True,
                                               norm_per_channel=False)
        meansvars = features_generator.mean_var_norm_per_file(h5f, tmp2)

        assert meansvars == [
            ('file1', 0, np.std(feature1)),
            ('file2', 1.5, np.std(feature2)),
        ]

        reader = h5features.Reader(tmp2)
        data = reader.read()
        for file in data.items():
            assert np.mean(data.dict_features()[file]) == pytest.approx(0)
            assert np.std(data.dict_features()[file]) == pytest.approx(1)

        shutil.rmtree(str(tempdir))
示例#2
0
def test_rw_one_frame_2D(tmpdir):
    """Write then read back a dataset holding a single 2D frame."""
    h5file = os.path.join(str(tmpdir), 'exemple.h5')
    expected = generate.full_data(1, 3, 1, 2)

    h5f.Writer(h5file).write(expected)
    read_back = h5f.Reader(h5file).read()
    assert read_back == expected
示例#3
0
File: utils.py  Project: yijiuzai/abnet3
def read_feats(features_file, align_features_file=None):
    """Load h5features data into Features_Accessor objects.

    Parameters
    ----------
    features_file : str
        Path to an h5features file holding a 'features' group.
    align_features_file : str, optional
        Path to a second h5features file with alignment features; when
        None, no alignment accessor is built.

    Returns
    -------
    tuple
        (features, align_features, feat_dim) where both accessors are
        Features_Accessor instances (align_features is None when no
        alignment file is given) and feat_dim is the feature dimension.
    """
    with h5features.Reader(features_file, 'features') as fh:
        features = fh.read()  # load all at once here...
    times = features.dict_labels()
    feats = features.dict_features()
    # feature dimension taken from the first stored item
    feat_dim = feats[list(feats.keys())[0]].shape[1]
    features = Features_Accessor(times, feats)
    if align_features_file is None:
        align_features = None
    else:
        # BUGFIX: this branch previously re-opened `features_file` instead
        # of `align_features_file`, silently returning the same features
        # twice and ignoring the alignment file entirely.
        with h5features.Reader(align_features_file, 'features') as fh:
            align_features = fh.read()  # load all at once here...
        times = align_features.dict_labels()
        feats = align_features.dict_features()
        align_features = Features_Accessor(times, feats)
    return features, align_features, feat_dim
示例#4
0
 def test_init_not_hdf(self):
     """Opening a file that is not HDF5 raises an informative IOError."""
     with tempfile.NamedTemporaryFile(delete=False) as tmpfile:
         tmpfile.write(b'This is not a HDF5 file')
     with pytest.raises(IOError) as err:
         h5f.Reader(tmpfile.name, self.groupname)
     assert 'not a HDF5 file' in str(err.value)
     remove(tmpfile.name)
示例#5
0
def test_read_tofromtimes(tmpdir, dim):
    """Check the from_time/to_time restrictions of Reader.read."""
    filename = os.path.join(str(tmpdir), 'test.h5f')
    groupname = 'group'
    data = generate.full_data(1, dim, 300)
    h5f.Writer(filename, mode='w').write(data, groupname=groupname)

    # a plain read returns exactly the written data
    assert h5f.Reader(filename, groupname).read() == data

    # reading the whole [0, 1] span returns everything as well
    assert h5f.Reader(filename, groupname).read(from_time=0, to_time=1) == data

    # a narrower window keeps only labels inside [0.4, 0.5]
    windowed = h5f.Reader(filename, groupname).read(from_time=0.4, to_time=0.5)
    labels = windowed.labels()[0]
    assert labels[0] >= 0.4
    assert labels[-1] <= 0.5
示例#6
0
def test_write_mode(tmpdir, mode, append):
    """Check Writer open modes combined with the append flag.

    A second write with mode='a' and append=True must fail (items
    already exist); any other combination rewrites the file and leaves
    the input data untouched.
    """
    # BUGFIX: the filename was concatenated to the directory with '+'
    # inside a single-argument os.path.join (yielding e.g.
    # '/tmp/pytest-0test.h5', outside the tmpdir and never cleaned up);
    # pass directory and filename as separate components instead.
    h5file = os.path.join(str(tmpdir), 'test.h5')
    data = generate.full_data(1, 3, 1, 2)
    copy_data = copy.deepcopy(data)

    h5f.Writer(h5file).write(data, append=False)
    assert h5f.Reader(h5file).read() == data

    # write data a second time
    if mode == 'a' and append is True:
        with pytest.raises(IOError):
            h5f.Writer(h5file, mode=mode).write(data, append=append)
    else:
        h5f.Writer(h5file, mode=mode).write(data, append=append)
        assert data == copy_data
        assert h5f.Reader(h5file).read() == copy_data
示例#7
0
    def mean_var_norm_per_file(self, h5f, mvn_h5f, vad_file=None):
        """Mean/variance normalize an h5features file, item by item.

        Each item's statistics are computed on its own frames (restricted
        to voiced frames when `vad_file` is given) and applied to all of
        its frames; the result is written to `mvn_h5f`.

        Parameters
        ----------
        h5f : str
            Path to the input h5features file.
        mvn_h5f : str
            Path to the output (normalized) h5features file.
        vad_file : str, optional
            Path of a VAD annotation file; items listed in it have their
            statistics computed on voiced frames only.

        Returns
        -------
        list
            One (item, mean, std) tuple per input item.
        """
        # normalize either per channel or on the whole spectrum.
        axis = 0 if self.norm_per_channel else None

        # open the input read-only and release the handle once the item
        # list is extracted (the original leaked two open h5py.File objects)
        with h5py.File(h5f, 'r') as fin:
            dset_name = list(fin.keys())[0]
            files = list(fin[dset_name]['items'])

        # parse the VAD file once instead of once per item
        vad_data = read_vad_file(vad_file) if vad_file is not None else None

        reader = h5features.Reader(h5f)
        means_vars = []
        for f in files:
            data = reader.read(from_item=f)
            items, features, times = (data.items(), data.features()[0],
                                      data.labels()[0])

            # restrict the statistics to voiced frames when available
            filtered_features = None
            if vad_data is not None and str(f) in vad_data:
                filtered_features = self.filter_vad_one_file(
                    features, times, vad_data[str(f)])

            stats_source = (features if filtered_features is None
                            else filtered_features)
            mean = np.mean(stats_source, axis=axis)
            std = np.std(stats_source, axis=axis)

            # eps avoids a division by zero on constant channels
            features = (features - mean) / (std + np.finfo(features.dtype).eps)
            h5features.write(mvn_h5f, '/features/', items, [times], [features])
            means_vars.append((f, mean, std))
        return means_vars
示例#8
0
def test_features(pitch, ftype, corpus, tmpdir):
    """End-to-end features computation on a 3-utterance subcorpus.

    Computes `ftype` features (optionally with pitch), checks the output
    files, converts them to h5features and verifies dimensions and
    utterance ids.
    """
    output_dir = str(tmpdir.mkdir('feats'))
    flog = os.path.join(output_dir, 'feats.log')
    log = utils.logger.get_log(flog)

    # keep only 3 utterances for testing speed
    subcorpus = corpus.subcorpus(list(corpus.utts())[0:3])
    assert len(list(subcorpus.utts())) == 3

    # mfcc with few channels
    nbc = 3
    feat = features.Features(subcorpus, output_dir, log=log)
    feat.type = ftype
    feat.njobs = 1
    feat.use_pitch = pitch
    feat.delete_recipe = False
    feat.features_options.append(
        ('num-ceps' if ftype in ('mfcc', 'plp') else 'num-mel-bins', nbc))

    try:
        feat.compute()
    except RuntimeError as err:
        # on failure dump the relevant logs to stdout for debugging;
        # context managers replace the original bare open(...).read()
        # calls that leaked three file handles
        import sys
        for logfile in (
                flog,
                os.path.join(
                    output_dir, 'recipe',
                    'exp/make_mfcc/features/make_mfcc_pitch_features.1.log'),
                os.path.join(output_dir, 'recipe/conf/mfcc.conf')):
            with open(logfile, 'r') as fin:
                sys.stdout.write(fin.read())
        raise err

    # # actually ERROR is in the vocabulary so this test fails...
    # assert_no_expr_in_log(flog, 'error')

    # basic asserts on files
    assert os.path.isfile(os.path.join(output_dir, 'meta.txt'))
    features.Features.check_features(output_dir)

    # convert to h5features and read it back
    h5 = os.path.join(output_dir, 'feats.h5')
    ark.scp_to_h5f(os.path.join(output_dir, 'feats.scp'), h5)
    data = h5features.Reader(h5, 'features').read()

    # check we have nbc or nbc+3 channels (pitch adds 3 channels)
    dim = data.features()[0].shape[1]
    exp = nbc + 3 if pitch else nbc
    assert dim == exp, 'bad dim: {}, expected {}'.format(dim, exp)

    # check utt_ids in h5f are consistent with corpus
    times = data.dict_labels()
    assert len(times.keys()) == len(subcorpus.utts())
    for t, c in zip(times.keys(), subcorpus.utts()):
        assert t == c
示例#9
0
    def test_const_on_write(self, tmpdir, mode, append):
        """A Data instance must not change before/after writing it to a
        group, whatever the mode/append combination."""
        # BUGFIX: os.path.join received a single concatenated argument
        # ('<tmpdir>test.h5'), creating the file *outside* the pytest
        # tmpdir; pass directory and filename as separate components.
        h5file = os.path.join(str(tmpdir), 'test.h5')

        # first write
        assert self.data.items() == self.items
        h5f.Writer(h5file, mode=mode).write(self.data, append=append)
        assert self.data.items() == self.items
        assert h5f.Reader(h5file).read() == self.data

        # second write of the same data
        if mode == 'a' and append is True:
            with pytest.raises(IOError):
                h5f.Writer(h5file, mode=mode).write(self.data, append=append)
        else:
            h5f.Writer(h5file, mode=mode).write(self.data, append=append)
            assert self.data.items() == self.items
            assert h5f.Reader(h5file).read() == self.data
示例#10
0
    def embed(self):
        """Embed multi-modal features through a saved network.

        Loads weights from ``self.network_path`` (when set), reads every
        file listed in ``self.feature_path``, feeds the per-item features
        of all modes through the network and writes the embeddings to
        ``self.output_path + "embedded.features"``.
        """

        if self.network_path is not None:
            self.network.load_network(self.network_path)
        self.network.eval()

        if self.cuda:
            self.network.cuda()

        # items/times are taken from the first feature file read.
        # NOTE(review): check_items / check_times are read for every file
        # but never compared against items/times -- the consistency check
        # their names suggest is not actually performed; confirm intent.
        items = None
        times = None
        features_list = []
        for path in self.feature_path:
            with h5features.Reader(path, 'features') as fh:
                features = fh.read()
                features_list.append(features.features())
                check_items = features.items()
                check_times = features.labels()
            if not items:
                items = check_items
            if not times:
                times = check_times

        print("Done loading input feature file")

        # iterate over items, collecting the same item from every mode
        zipped_feats = zip(*features_list)
        embeddings = []
        for feats in zipped_feats:
            modes_list = []
            for feat in feats:
                # the network expects float32 tensors
                if feat.dtype != np.float32:
                    feat = feat.astype(np.float32)
                # volatile=True: inference-only graph (pre-0.4 PyTorch API)
                feat_torch = Variable(torch.from_numpy(feat), volatile=True)
                if self.cuda:
                    feat_torch = feat_torch.cuda()
                modes_list.append(feat_torch)
            emb, _ = self.network(modes_list, modes_list)
            emb = emb.cpu()
            embeddings.append(emb.data.numpy())

            # Register activity on observer
            for observer in self.observers:
                observer.register_status()

        data = h5features.Data(items, times, embeddings, check=True)
        with h5features.Writer(self.output_path + "embedded.features") as fh:
            fh.write(data, 'features')

        # Save observer registers
        for observer in self.observers:
            observer.save(items, times)
示例#11
0
def test_from_exemple(tmpdir):
    """Write 100 generated items to a group and read them back unchanged."""
    filename = os.path.join(str(tmpdir), 'exemple.h5')
    gen_items, gen_labels, gen_feats = generate.full(100)
    data = h5f.Data(gen_items, gen_labels, gen_feats)

    h5f.Writer(filename).write(data, 'group')

    with h5f.Reader(filename, 'group') as reader:
        read_back = reader.read()
        assert len(read_back.items()) == 100
        assert data == read_back
示例#12
0
def test_h5f_name_of_utterance(tmpdir, data, name):
    """An utterance keeps its (possibly exotic) name through ark -> h5f."""
    data = {name: data['test']}

    # write the single-utterance dict as an ark file
    ark = os.path.join(str(tmpdir), 'ark')
    io.dict_to_ark(ark, data)

    # convert it to h5features file
    h5file = os.path.join(str(tmpdir), 'h5f')
    io.ark_to_h5f([ark], h5file)

    # the features are retrieved under the same utterance name
    read_back = h5f.Reader(h5file).read()
    assert np.allclose(read_back.dict_features()[name], data[name])
示例#13
0
    def _load(self, groupname='features'):
        """Read the h5features file and build the features collection."""
        self._log.info('loading %s', self.filename)

        data = h5features.Reader(self.filename, groupname=groupname).read()

        collection = self._features_collection()
        for idx in range(len(data.items())):
            item = data.items()[idx]
            collection[item] = self._features(
                data.features()[idx],
                data.labels()[idx],
                properties=data.properties()[idx],
                validate=False)
        return collection
示例#14
0
def write_kl_to_column(distance_list, PG_file, root):
    """Write DTW-KL distances for each triplet into the original table."""
    hf5_file = root + PG_file
    times_r, features_r = h5f.read(hf5_file, 'features')
    items = h5f.Reader(hf5_file, 'features').items.data[0:]

    oth_x_array = np.array([])
    tgt_x_array = np.array([])

    for trip_num in range(1, 113):
        # select only item names which correspond to the same triplet
        trip_id = 'triplet' + '{0:03}'.format(trip_num)
        trip_items = [itm for itm in items if trip_id in itm]

        # trace the roles: 01 = OTH, 02 = TGT, 03 = X
        item_oth = [itm for itm in trip_items if '_01' in itm][0]
        item_tgt = [itm for itm in trip_items if '_02' in itm][0]
        item_x = [itm for itm in trip_items if '_03' in itm][0]

        # KL divergence for OTH-X and TGT-X, appended to the result arrays
        oth_x_array = np.append(
            oth_x_array,
            dtw_kl_divergence(features_r[item_oth], features_r[item_x]))
        tgt_x_array = np.append(
            tgt_x_array,
            dtw_kl_divergence(features_r[item_tgt], features_r[item_x]))

    # column names derived from the file's base name
    base_name = PG_file.split('.')[0]
    distance_list[base_name + '_oth_x'] = pd.Series(
        oth_x_array, index=distance_list.index)
    distance_list[base_name + '_tgt_x'] = pd.Series(
        tgt_x_array, index=distance_list.index)

    return distance_list
示例#15
0
    def embed(self):
        """Embed features through a saved network, batching long items.

        Loads weights from ``self.network_path`` (when set), reads
        ``self.feature_path``, runs every item through the network in
        batches of ``self.batch_size`` frames and writes the embeddings
        to ``self.output_path``.
        """
        if self.network_path is not None:
            self.network.load_network(self.network_path)
        self.network.eval()

        if self.cuda:
            self.network.cuda()
        print("Done loading network weights")

        with h5features.Reader(self.feature_path, 'features') as fh:
            features = fh.read()

        items = features.items()
        times = features.labels()
        feats = features.features()
        print("Done loading input feature file")

        embeddings = []
        for feat in feats:
            # the network expects float32 tensors
            if feat.dtype != np.float32:
                feat = feat.astype(np.float32)
            # BUGFIX: ceil division. The previous `len // size + 1`
            # produced an extra *empty* batch whenever len(feat) was a
            # multiple of batch_size, feeding an empty array to the
            # network; max(1, ...) keeps the empty-item behavior sane.
            n_batches = max(1, -(-len(feat) // self.batch_size))
            batches_feat = np.array_split(feat, n_batches)
            outputs = []
            for b_feat in batches_feat:
                # volatile=True: inference-only graph (pre-0.4 PyTorch API)
                feat_torch = Variable(torch.from_numpy(b_feat), volatile=True)
                if self.cuda:
                    feat_torch = feat_torch.cuda()
                emb, _ = self.network(feat_torch, feat_torch)
                emb = emb.cpu()
                outputs.append(emb.data.numpy())
            outputs = np.vstack(outputs)
            embeddings.append(outputs)

        data = h5features.Data(items, times, embeddings, check=True)
        with h5features.Writer(self.output_path) as fh:
            fh.write(data, 'features')
示例#16
0
    def embed(self):
        """Embed features through a saved speaker/phoneme network.

        Runs every item of ``self.feature_path`` through the network and
        writes two h5features files: ``self.output_path + '.spk'`` and
        ``self.output_path + '.phn'``.
        """
        if self.network_path is not None:
            self.network.load_network(self.network_path)
        self.network.eval()

        if self.cuda:
            self.network.cuda()

        with h5features.Reader(self.feature_path, 'features') as fh:
            features = fh.read()

        items = features.items()
        times = features.labels()

        embeddings_spk = []
        embeddings_phn = []
        for feat in features.features():
            # the network expects float32 tensors
            if feat.dtype != np.float32:
                feat = feat.astype(np.float32)
            batch = Variable(torch.from_numpy(feat), volatile=True)
            if self.cuda:
                batch = batch.cuda()
            out_spk, out_phn, _, _ = self.network(batch, batch)
            embeddings_spk.append(out_spk.cpu().data.numpy())
            embeddings_phn.append(out_phn.cpu().data.numpy())

        data_spk = h5features.Data(items, times, embeddings_spk, check=True)
        data_phn = h5features.Data(items, times, embeddings_phn, check=True)

        with h5features.Writer(self.output_path+'.spk') as fh:
            fh.write(data_spk, 'features')

        with h5features.Writer(self.output_path+'.phn') as fh:
            fh.write(data_phn, 'features')
示例#17
0
def test_h5f_twice(tmpdir, data):
    """Two ark files merge into one h5f group; rewriting the group fails."""
    # write the arrays as two ark files, the second with suffixed names
    ark = os.path.join(str(tmpdir), 'ark')
    ark2 = os.path.join(str(tmpdir), 'ark2')
    io.dict_to_ark(ark, data)
    io.dict_to_ark(ark2, {key + '_2': feats for key, feats in data.items()})

    # convert both arks to a single h5features group
    h5file = os.path.join(str(tmpdir), 'h5f')
    io.ark_to_h5f([ark, ark2], h5file, 'test')

    # read back and check items, shapes and values
    read_back = h5f.Reader(h5file, 'test').read()
    assert read_back.items() == ['test', 'test2', 'test2_2', 'test_2']
    assert read_back.dict_labels()['test'].shape[0] == data['test'].shape[0]
    assert data['test'].shape == read_back.dict_features()['test'].shape
    assert data['test'].shape == read_back.dict_features()['test2_2'].shape
    assert np.allclose(read_back.dict_features()['test'], data['test'])
    assert np.allclose(read_back.dict_features()['test2_2'], data['test2'])

    # test writing in an existing group
    with pytest.raises(AssertionError):
        io.ark_to_h5f([ark], h5file, 'test')
示例#18
0
    # append a second batch of items to another group of the same file
    data2 = generate_data(10, base='item2')
    writer.write(data2, 'group2', append=True)

    # If append is not True, existing data in the group is overwritten.
    data3 = generate_data(10, base='item3')
    writer.write(data3, 'group2', append=True)  # 120 items
    writer.write(data3, 'group2')  # 10 items

##########################
# Reading data from a file
##########################

# Initialize a reader and load the entire group. A notable difference
# with the Writer is that a Reader is attached to a specific group of
# a file. This allows optimized read operations.
rdata = h5f.Reader('exemple.h5', 'group1').read()

# Hopefully we read the same data we just wrote
assert rdata == data

# Some more advanced reading facilities
with h5f.Reader('exemple.h5', 'group1') as reader:
    # Same as before, read the whole data
    whole_data = reader.read()

    # Read the first item stored on the group.
    first_item = reader.items.data[0]
    rdata = reader.read(first_item)
    assert len(rdata.items()) == 1

    # Read an interval composed of the 10 first items.
示例#19
0
    def test_normalization_with_VAD(self):
        """Global mean/variance normalization restricted to VAD frames.

        Only file1 has VAD annotations (covering its first 75 frames);
        the global statistics must be computed over those frames plus
        all of file2, then applied to every frame of both files.
        """
        # paths
        tempdir = Path(tempfile.mkdtemp())
        h5f = str(tempdir / 'h5.features')
        vad_file = str(tempdir / 'vad')

        # write VAD data for file 1
        with open(vad_file, 'w') as vad1:
            vad1.write("file,start,stop\n"
                       "file1,0.0025,0.5000\n"
                       "file1,0.7525,1.000\n")

        items = ['file1', 'file2']

        # generate data: 100 frames x 40 channels per file
        feature1 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
        feature2 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
        features = [feature1, feature2]
        times = [np.arange(feature1.shape[0], dtype=float) * 0.01 + 0.0025]
        times.append(np.arange(feature2.shape[0], dtype=float) * 0.01 + 0.0025)
        h5features.write(h5f, '/features/', items, times, features)

        # per-channel statistics
        h5f_mean_var = str(tempdir / 'h5-normalized.features')
        features_generator = FeaturesGenerator(normalization=True,
                                               norm_per_file=True,
                                               norm_per_channel=True)
        mean, var = features_generator.mean_variance_normalisation(
            h5f, h5f_mean_var, vad_file=vad_file)

        # statistics computed on file1's 75 voiced frames + all of file2
        assert mean == pytest.approx(
            np.mean(np.vstack([feature1[:75], feature2]), axis=0))
        assert var == pytest.approx(
            np.std(np.vstack([feature1[:75], feature2]), axis=0))

        # but the normalization is applied to *all* frames of both files
        reader = h5features.Reader(h5f_mean_var)
        data = reader.read()
        assert data.dict_features()['file1'] == pytest.approx(
            (feature1 - mean) / var)

        assert data.dict_features()['file2'] == pytest.approx(
            (feature2 - mean) / var)

        ## test no per channel: scalar statistics this time
        tmp2 = str(tempdir / 'tmp2.h5')
        features_generator = FeaturesGenerator(normalization=True,
                                               norm_per_file=True,
                                               norm_per_channel=False)
        mean, var = features_generator.mean_variance_normalisation(
            h5f, tmp2, vad_file=vad_file)

        assert mean == pytest.approx\
            (np.mean(np.vstack([feature1[:75], feature2])))
        assert var == pytest.approx(
            np.std(np.vstack([feature1[:75], feature2])))

        reader = h5features.Reader(tmp2)
        data = reader.read()
        assert data.dict_features()['file1'] == pytest.approx(
            (feature1 - mean) / var)

        assert data.dict_features()['file2'] == pytest.approx(
            (feature2 - mean) / var)

        shutil.rmtree(str(tempdir))
示例#20
0
    def test_norm_per_file_with_VAD(self):
        """Per-file normalization restricted to VAD (voiced) frames.

        file1 has VAD annotations covering its first 75 frames, so its
        statistics must come from those frames only; file2 has no
        annotations and is normalized over all of its frames.
        """
        # paths
        tempdir = Path(tempfile.mkdtemp())
        h5f = str(tempdir / 'h5.features')
        vad_path = str(tempdir / 'vad')

        # write VAD data for file 1
        with open(str(vad_path), 'w') as vad1:
            vad1.write("file,start,stop\n"
                       "file1,0.0025,0.5000\n"
                       "file1,0.7525,1.000\n")

        items = ['file1', 'file2']

        # generate data: 100 frames x 40 channels per file
        feature1 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
        feature2 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
        features = [feature1, feature2]
        times = [np.arange(feature1.shape[0], dtype=float) * 0.01 + 0.0025]
        times.append(np.arange(feature2.shape[0], dtype=float) * 0.01 + 0.0025)
        h5features.write(h5f, '/features/', items, times, features)

        # per-channel statistics
        h5f_mean_var = str(tempdir / 'h5-normalized.features')
        features_generator = FeaturesGenerator(normalization=True,
                                               norm_per_file=True,
                                               norm_per_channel=True)
        meansvars = features_generator.mean_var_norm_per_file(
            h5f, h5f_mean_var, vad_file=str(vad_path))

        assert meansvars[0][0] == 'file1'

        # file1 statistics use only its 75 voiced frames
        assert all(meansvars[0][1] == np.mean(feature1[:75], axis=0))
        assert all(meansvars[0][2] == np.std(feature1[:75], axis=0))

        assert meansvars[1][0] == 'file2'
        assert all(meansvars[1][1] == np.mean(feature2, axis=0))
        assert all(meansvars[1][2] == np.std(feature2, axis=0))

        reader = h5features.Reader(h5f_mean_var)
        data = reader.read()
        assert data.dict_features()['file1'] == pytest.approx(
            (feature1 - np.mean(feature1[:75])) / np.std(feature1[:75]))

        # file2 is normalized over all its frames: zero mean, unit std
        assert np.mean(data.dict_features()['file2']) == pytest.approx(0)
        assert np.std(data.dict_features()['file2']) == pytest.approx(1)

        # test no per channel: scalar statistics this time
        features_generator = FeaturesGenerator(
            normalization=True,
            norm_per_file=True,
            norm_per_channel=False,
        )
        tmp2 = str(tempdir / 'tmp2.h5')
        meansvars = features_generator.mean_var_norm_per_file(
            h5f, tmp2, vad_file=str(vad_path))

        assert meansvars == [
            ('file1', np.mean(feature1[:75]), np.std(feature1[:75])),
            ('file2', np.mean(feature2), np.std(feature2)),
        ]

        reader = h5features.Reader(tmp2)
        data = reader.read()
        assert data.dict_features()['file1'] == pytest.approx(
            (feature1 - np.mean(feature1[:75])) / np.std(feature1[:75]))

        assert np.mean(data.dict_features()['file2']) == pytest.approx(0)
        assert np.std(data.dict_features()['file2']) == pytest.approx(1)
        shutil.rmtree(str(tempdir))
示例#21
0
 def test_read_time(self):
     """A read over the whole [0, 1] time span equals a full read."""
     reader = h5f.Reader(self.filename, self.groupname)
     windowed = reader.read(from_time=0, to_time=1)
     assert windowed == reader.read()
示例#22
0
 def test_init_basic(self):
     """A freshly created reader exposes version, format and item count."""
     reader = h5f.Reader(self.filename, self.groupname)
     assert reader.version == '1.1'
     assert reader.dformat == 'dense'
     assert self.nitems == len(reader.items.data)
示例#23
0
 def test_groupname_is_none(self):
     """Reading with groupname=None still retrieves the stored data."""
     read_back = h5f.Reader(self.filename, None).read()
     assert read_back == self.data
示例#24
0
 def test_read_basic(self):
     """A plain read returns exactly the data that was written."""
     read_back = h5f.Reader(self.filename, self.groupname).read()
     assert read_back == self.data
示例#25
0
 def test_init_not_group(self):
     """Opening a non-existing group raises an informative IOError."""
     bad_group = self.groupname + 'spam'
     with pytest.raises(IOError) as err:
         h5f.Reader(self.filename, bad_group)
     assert 'not a valid group' in str(err.value)
示例#26
0
 def test_init_not_file(self):
     """Opening a non-existing file raises an informative IOError."""
     bad_file = self.filename + 'spam'
     with pytest.raises(IOError) as err:
         h5f.Reader(bad_file, self.groupname)
     assert 'not a HDF5 file' in str(err.value)