Example #1
def generate_data(nitem, nfeat=2, dim=10, labeldim=1, base='item'):
    """Returns a randomly generated h5f.Data instance.

    - nitem is the number of items to generate.
    - nfeat is the number of features to generate for each item.
    - dim is the dimension of the features vectors.
    - base is the items basename
    - labeldim is the dimension of the labels vectors.
    """
    import numpy as np

    # A list of item names
    items = [base + '_' + str(i) for i in range(nitem)]

    # A list of features arrays
    features = [np.random.randn(nfeat, dim) for _ in range(nitem)]

    # A list of 1D or 2D label (timestamp) arrays
    if labeldim == 1:
        labels = [np.linspace(0, 1, nfeat)] * nitem
    else:
        t = np.linspace(0, 1, nfeat)
        labels = [np.array([t + i for i in range(labeldim)])] * nitem

    # Format data as required by the writer
    return h5f.Data(items, labels, features, check=True)
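A minimal usage sketch (the h5f alias for the h5features package is an assumption here, as the snippet's module-level imports are not shown):

import h5features as h5f

# two items, five frames each, 10-dimensional features
data = generate_data(nitem=2, nfeat=5, dim=10)
h5f.Writer('example.h5').write(data, 'features')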
Example #2
    def setup(self):
        self.filename = 'test.h5'
        self.groupname = 'group'
        self.nitems = 10
        # Generate random items, labels and features, wrap them in a
        # Data instance and write it to the test file.
        d = generate.full(self.nitems)
        self.data = h5f.Data(d[0], d[1], d[2])
        h5f.Writer(self.filename).write(self.data, self.groupname)
Example #3
def generate_utterance_item(input_folder,
                            output_name,
                            include_ivector=True,
                            utt2spk_file=None,
                            include_matlab=False):
    """
    """
    os.chdir(input_folder)
    ark_filenames = []
    for filename in os.listdir('.'):
        if filename.endswith(".ark") and filename.startswith("ivector"):
            ark_filenames.append(filename)

    print(str(len(ark_filenames)) + ' ivector .ark files found.')
    if utt2spk_file is not None:
        with codecs.open(utt2spk_file, mode='r', encoding='UTF-8') as inp:
            lines = inp.read().splitlines()
        utt2spk = {}
        for l in lines:
            u = l.split(None, 1)
            utt2spk[u[0]] = u[1]
        print('utt2spk file loaded')

    utts = []
    times = []
    feats = []
    spks = []
    for f in ark_filenames:
        print('loading ' + f)

        with codecs.open(f, mode='r', encoding='UTF-8') as inp:
            lines = inp.read().splitlines()
            for l in lines:
                u = l.strip().split(None, 1)
                vector = u[1].split()
                utts.append(u[0])
                if utt2spk_file is not None:
                    spks.append(utt2spk[u[0]])
                assert vector.pop(0) == '[', 'start of vector marker not found'
                assert vector.pop(-1) == ']', 'end of vector marker not found'
                feats.append(np.array([vector]).astype('float'))
                times.append(np.array([0.1]))

    if include_ivector:
        with h5f.Writer(output_name + '_vectors.ivector') as writer:
            data = h5f.Data(utts, times, feats, check=True)
            writer.write(data, 'features')

    if include_matlab:
        sio.savemat(
            output_name + '_vectors.mat', {
                output_name + '_vectors': feats,
                output_name + '_utts': utts,
                output_name + '_spks': spks
            })
Example #4
def _ark_to_data(arkfile, sample_frequency=100, tstart=0.0125):
    """ark to h5features.Data"""
    d = ark_to_dict(arkfile)

    # one time label per frame, starting at tstart and sampled at
    # sample_frequency Hz
    times = [
        np.arange(val.shape[0], dtype=float) / sample_frequency + tstart
        for val in d.values()
    ]

    return h5f.Data(list(d.keys()), times, list(d.values()))
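ark_to_dict (not shown here) is assumed to return a mapping from utterance names to 2D arrays with one feature frame per row. A hedged usage sketch, with a hypothetical ark path:

data = _ark_to_data('feats.ark')  # hypothetical input file
h5f.Writer('feats.features').write(data, 'features')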
Example #5
    def embed(self):
        """
        Embed method to embed features based on a saved network
        """

        if self.network_path is not None:
            self.network.load_network(self.network_path)
        self.network.eval()

        if self.cuda:
            self.network.cuda()

        items = None
        times = None
        features_list = []
        for path in self.feature_path:
            with h5features.Reader(path, 'features') as fh:
                features = fh.read()
                features_list.append(features.features())
                check_items = features.items()
                check_times = features.labels()
            # keep the items and time labels from the first file read
            if items is None:
                items = check_items
            if times is None:
                times = check_times

        print("Done loading input feature file")

        zipped_feats = zip(*features_list)
        embeddings = []
        for feats in zipped_feats:
            modes_list = []
            for feat in feats:
                if feat.dtype != np.float32:
                    feat = feat.astype(np.float32)
                feat_torch = Variable(torch.from_numpy(feat), volatile=True)
                if self.cuda:
                    feat_torch = feat_torch.cuda()
                modes_list.append(feat_torch)
            emb, _ = self.network(modes_list, modes_list)
            emb = emb.cpu()
            embeddings.append(emb.data.numpy())

            # Register activity on the observers
            for observer in self.observers:
                observer.register_status()

        data = h5features.Data(items, times, embeddings, check=True)
        with h5features.Writer(self.output_path + "embedded.features") as fh:
            fh.write(data, 'features')

        # Save the observers' registers
        for observer in self.observers:
            observer.save(items, times)
Example #6
def test_from_exemple(tmpdir):
    filename = os.path.join(str(tmpdir), 'exemple.h5')
    a1, a2, a3 = generate.full(100)
    data = h5f.Data(a1, a2, a3)

    h5f.Writer(filename).write(data, 'group')

    with h5f.Reader(filename, 'group') as r:
        rdata = r.read()
        assert len(rdata.items()) == 100
        assert data == rdata
Example #7
    def _save(self, features, with_properties, compress=True):
        self._log.info('writing %s', self.filename)

        # we safely use append mode as we are sure at this point the
        # file does not exist (from FeaturesSerializer.save)
        with h5features.Writer(
                self.filename,
                mode='a',
                chunk_size='auto',
                compression='lzf' if compress else None) as writer:
            # append the features to the file one by one (this avoids
            # duplicating the whole collection in memory, which can
            # cause MemoryError on big datasets)
            for k, v in features.items():
                if with_properties:
                    data = h5features.Data([k], [v.times], [v.data],
                                           properties=[v.properties])
                else:
                    data = h5features.Data([k], [v.times], [v.data])
                writer.write(data, groupname='features', append=True)
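Reading such a file back mirrors the Reader usage of the other examples; a minimal sketch with a hypothetical filename:

with h5features.Reader('features.h5', 'features') as reader:
    data = reader.read()
    # items, time labels and feature arrays are aligned by index
    items, times, feats = data.items(), data.labels(), data.features()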
Example #8
    def save(self, items, times):
        '''
        Save the internal responses to self.path

        :param items:   same items used to save the embeddings
        :param times:   same times used to save the embeddings

        '''
        data = h5features.Data(items, times, self.intern_responses, check=True)
        with h5features.Writer(self.path) as fh:
            fh.write(data, 'features')
Example #9
    def embed(self):
        """ Embed method to embed features based on a saved network

        """
        if self.network_path is not None:
            self.network.load_network(self.network_path)
        self.network.eval()

        if self.cuda:
            self.network.cuda()

        with h5features.Reader(self.feature_path, 'features') as fh:
            features = fh.read()

        items = features.items()
        times = features.labels()
        feats = features.features()

        embeddings_spk, embeddings_phn = [], []
        for feat in feats:
            if feat.dtype != np.float32:
                feat = feat.astype(np.float32)
            feat_torch = Variable(torch.from_numpy(feat), volatile=True)
            if self.cuda:
                feat_torch = feat_torch.cuda()
            emb_spk, emb_phn, _, _ = self.network(feat_torch, feat_torch)
            emb_spk = emb_spk.cpu()
            emb_phn = emb_phn.cpu()
            embeddings_spk.append(emb_spk.data.numpy())
            embeddings_phn.append(emb_phn.data.numpy())

        data_spk = h5features.Data(items, times, embeddings_spk, check=True)
        data_phn = h5features.Data(items, times, embeddings_phn, check=True)

        with h5features.Writer(self.output_path + '.spk') as fh:
            fh.write(data_spk, 'features')

        with h5features.Writer(self.output_path + '.phn') as fh:
            fh.write(data_phn, 'features')
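Note that Variable(..., volatile=True), used in this and the neighboring embed variants, is the inference idiom of PyTorch before 0.4 and was removed in later releases. Under a modern PyTorch assumption, the equivalent of the loop above would run under torch.no_grad() (a sketch, not the author's code; feats, network and cuda stand for the corresponding attributes of the embedder):

import torch

with torch.no_grad():
    for feat in feats:
        feat_torch = torch.from_numpy(feat.astype(np.float32))
        if cuda:
            feat_torch = feat_torch.cuda()
        emb_spk, emb_phn, _, _ = network(feat_torch, feat_torch)
        embeddings_spk.append(emb_spk.cpu().numpy())
        embeddings_phn.append(emb_phn.cpu().numpy())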
Example #10
    def extract_h5_features(audio_features=None,
                            ema_features=None,
                            inverter=None,
                            output_name='%s_features' % corpus,
                            articulators=None,
                            dynamic_ema=True,
                            sampling_rate=100):
        """Build an h5 file recording audio features associated with {0} data.

        audio_features : optional name of audio features to use, including
                         normalization indications
        ema_features   : optional name of ema features' normalization to use
                         (use '' for raw data and None for no EMA data)
        inverter       : optional acoustic-articulatory inverter whose
                         predictions to use, based on the audio features
        output_name    : base name of the output file (default '{0}_features')
        articulators   : optional list of articulators to keep among EMA data
        dynamic_ema    : whether to include dynamic articulatory features
                         (bool, default True)
        sampling_rate  : sampling rate of the frames, in Hz (int, default 100)
        """
        # Arguments serve modularity; pylint: disable=too-many-arguments
        nonlocal abx_folder, get_utterances, _setup_features_loader
        # Build the abx folder, if necessary.
        if not os.path.isdir(abx_folder):
            os.makedirs(abx_folder)
        # Check that the destination file does not exist.
        output_file = os.path.join(abx_folder, '%s.features' % output_name)
        if os.path.isfile(output_file):
            raise FileExistsError("File '%s' already exists." % output_file)
        # Set up the features loading function.
        load_features = _setup_features_loader(audio_features, ema_features,
                                               inverter, dynamic_ema,
                                               articulators)
        # Load the list of utterances and process them iteratively.
        utterances = get_utterances()
        with h5f.Writer(output_file) as writer:
            for i in range(0, len(utterances), 100):
                # Load or compute utterances list, features and time labels.
                items = utterances[i:i + 100]
                features = [load_features(item) for item in items]
                labels = [
                    np.arange(len(data)) / sampling_rate for data in features
                ]
                # Write the currently processed utterances' data to h5.
                writer.write(h5f.Data(items, labels, features, check=True),
                             groupname='features',
                             append=True)
Example #11
    def embed(self):
        """ Embed method to embed features based on a saved network

        """
        if self.network_path is not None:
            self.network.load_network(self.network_path)
        self.network.eval()

        if self.cuda:
            self.network.cuda()
        print("Done loading network weights")

        with h5features.Reader(self.feature_path, 'features') as fh:
            features = fh.read()

        items = features.items()
        times = features.labels()
        feats = features.features()
        print("Done loading input feature file")

        embeddings = []
        for feat in feats:
            if feat.dtype != np.float32:
                feat = feat.astype(np.float32)
            # split the item's frames into roughly equal batches
            n_batches = len(feat) // self.batch_size + 1
            batches_feat = np.array_split(feat, n_batches)
            outputs = []
            for b_feat in batches_feat:
                feat_torch = Variable(torch.from_numpy(b_feat), volatile=True)
                if self.cuda:
                    feat_torch = feat_torch.cuda()
                emb, _ = self.network(feat_torch, feat_torch)
                emb = emb.cpu()
                outputs.append(emb.data.numpy())
            outputs = np.vstack(outputs)
            embeddings.append(outputs)

        data = h5features.Data(items, times, embeddings, check=True)
        with h5features.Writer(self.output_path) as fh:
            fh.write(data, 'features')