def generate_data(nitem, nfeat=2, dim=10, labeldim=1, base='item'):
    """Returns a randomly generated h5f.Data instance.

    - nitem is the number of items to generate.
    - nfeat is the number of features to generate for each item.
    - dim is the dimension of the features vectors.
    - base is the items basename.
    - labeldim is the dimension of the labels vectors.

    """
    import numpy as np

    # A list of item names
    items = [base + '_' + str(i) for i in range(nitem)]

    # A list of features arrays
    features = [np.random.randn(nfeat, dim) for _ in range(nitem)]

    # A list of 1D or 2D labels (times) arrays
    if labeldim == 1:
        labels = [np.linspace(0, 1, nfeat)] * nitem
    else:
        t = np.linspace(0, 1, nfeat)
        labels = [np.array([t + i for i in range(labeldim)])] * nitem

    # Format the data as required by the writer
    return h5f.Data(items, labels, features, check=True)
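# A minimal usage sketch, assuming the module imports h5features as h5f;
# the file and group names here are illustrative, not from the original:
import h5features as h5f

data = generate_data(nitem=5, nfeat=3, dim=10)
h5f.Writer('example.h5').write(data, 'group')
with h5f.Reader('example.h5', 'group') as reader:
    assert len(reader.read().items()) == 5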
def setup(self):
    self.filename = 'test.h5'
    self.groupname = 'group'
    self.nitems = 10

    d = generate.full(self.nitems)
    self.data = h5f.Data(d[0], d[1], d[2])
    h5f.Writer(self.filename).write(self.data, self.groupname)
def generate_utterance_item(input_folder, output_name, include_ivector=True,
                            utt2spk_file=None, include_matlab=False):
    """Read Kaldi ivector .ark text files from input_folder and save the
    vectors as h5features (and optionally as a MATLAB .mat file)."""
    os.chdir(input_folder)

    ark_filenames = []
    for filename in os.listdir('.'):
        if filename.endswith(".ark") and filename.startswith("ivector"):
            ark_filenames.append(filename)
    print(str(len(ark_filenames)) + ' ivector .ark files found.')

    # Load the optional utterance-to-speaker mapping.
    utt2spk = {}
    if utt2spk_file is not None:
        with codecs.open(utt2spk_file, mode='r', encoding='UTF-8') as inp:
            lines = inp.read().splitlines()
        for l in lines:
            u = l.split(None, 1)
            utt2spk[u[0]] = u[1]
        print('utt2spk file loaded')

    utts = []
    times = []
    feats = []
    spks = []
    for f in ark_filenames:
        print('loading ' + f)
        with codecs.open(f, mode='r', encoding='UTF-8') as inp:
            lines = inp.read().splitlines()
        for l in lines:
            u = l.strip().split(None, 1)
            vector = u[1].split()
            utts.append(u[0])
            if utt2spk_file is not None:
                spks.append(utt2spk[u[0]])
            assert vector.pop(0) == '[', 'start of vector marker not found'
            assert vector.pop(-1) == ']', 'end of vector marker not found'
            feats.append(np.array([vector]).astype('float'))
            times.append(np.array([0.1]))

    if include_ivector:
        with h5f.Writer(output_name + '_vectors.ivector') as writer:
            data = h5f.Data(utts, times, feats, check=True)
            writer.write(data, 'features')

    if include_matlab:
        sio.savemat(
            output_name + '_vectors.mat',
            {output_name + '_vectors': feats,
             output_name + '_utts': utts,
             output_name + '_spks': spks})
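# The text .ark lines parsed above have the form "utt-id  [ v1 v2 ... vn ]",
# as implied by the bracket-marker assertions. A minimal, hypothetical
# example of parsing one such line:
import numpy as np

line = 'utt_001  [ 0.1 -0.2 0.3 ]'
u = line.strip().split(None, 1)
vector = u[1].split()
assert vector.pop(0) == '[' and vector.pop(-1) == ']'
print(u[0], np.array([vector]).astype('float'))  # utt_001 [[ 0.1 -0.2  0.3]]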
def _ark_to_data(arkfile, sample_frequency=100, tstart=0.0125):
    """ark to h5features.Data"""
    d = ark_to_dict(arkfile)
    times = [
        np.arange(val.shape[0], dtype=float) / sample_frequency + tstart
        for val in d.values()]
    return h5f.Data(list(d.keys()), times, list(d.values()))
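# With the defaults, frame i is stamped tstart + i / sample_frequency, i.e.
# 12.5 ms, 22.5 ms, 32.5 ms, ... (presumably the centers of 25 ms analysis
# windows shifted by 10 ms). A quick check, assuming ark_to_dict() maps
# utterance names to (nframes, ndims) arrays:
import numpy as np

frames = np.zeros((3, 13))  # stand-in for one utterance's features
times = np.arange(frames.shape[0], dtype=float) / 100 + 0.0125
print(times)  # [0.0125 0.0225 0.0325]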
def embed(self):
    """Embed method to embed features based on a saved network"""
    if self.network_path is not None:
        self.network.load_network(self.network_path)
    self.network.eval()
    if self.cuda:
        self.network.cuda()

    items = None
    times = None
    features_list = []
    for path in self.feature_path:
        with h5features.Reader(path, 'features') as fh:
            features = fh.read()
        features_list.append(features.features())
        check_items = features.items()
        check_times = features.labels()
        if not items:
            items = check_items
        if not times:
            times = check_times
    print("Done loading input feature file")

    zipped_feats = zip(*features_list)
    embeddings = []
    for feats in zipped_feats:
        modes_list = []
        for feat in feats:
            if feat.dtype != np.float32:
                feat = feat.astype(np.float32)
            feat_torch = Variable(torch.from_numpy(feat), volatile=True)
            if self.cuda:
                feat_torch = feat_torch.cuda()
            modes_list.append(feat_torch)
        emb, _ = self.network(modes_list, modes_list)
        emb = emb.cpu()
        embeddings.append(emb.data.numpy())

    # Register activity on observers
    for observer in self.observers:
        observer.register_status()

    data = h5features.Data(items, times, embeddings, check=True)
    with h5features.Writer(self.output_path + "embedded.features") as fh:
        fh.write(data, 'features')

    # Save observer registers
    for observer in self.observers:
        observer.save(items, times)
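# Variable(..., volatile=True) is the pre-0.4 PyTorch inference API. On
# current PyTorch the same no-gradient forward pass would be written with
# torch.no_grad(); a sketch assuming a network with the same call signature
# as above (the two-argument forward and the name embed_batch are
# illustrative, not from the original):
import numpy as np
import torch

def embed_batch(network, feat, use_cuda=False):
    # Build the input tensor and run the forward pass without autograd.
    feat_torch = torch.from_numpy(feat.astype(np.float32))
    if use_cuda:
        feat_torch = feat_torch.cuda()
    with torch.no_grad():
        emb, _ = network(feat_torch, feat_torch)
    return emb.cpu().numpy()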
def test_from_exemple(tmpdir):
    filename = os.path.join(str(tmpdir), 'exemple.h5')

    a1, a2, a3 = generate.full(100)
    data = h5f.Data(a1, a2, a3)
    h5f.Writer(filename).write(data, 'group')

    with h5f.Reader(filename, 'group') as r:
        rdata = r.read()
        assert len(rdata.items()) == 100
        assert data == rdata
def _save(self, features, with_properties, compress=True):
    self._log.info('writing %s', self.filename)

    # we safely use append mode as we are sure at this point the
    # file does not exist (from FeaturesSerializer.save)
    with h5features.Writer(
            self.filename, mode='a', chunk_size='auto',
            compression='lzf' if compress else None) as writer:
        # append the features to the file one by one (this avoids
        # duplicating the whole collection in memory, which can
        # cause MemoryError on big datasets).
        for k, v in features.items():
            if with_properties:
                data = h5features.Data(
                    [k], [v.times], [v.data], properties=[v.properties])
            else:
                data = h5features.Data([k], [v.times], [v.data])
            writer.write(data, groupname='features', append=True)
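# A hedged sketch of the same one-item-at-a-time append pattern;
# FeatureItem is a hypothetical stand-in for the real feature objects,
# which are assumed to expose .times and .data attributes:
import collections
import numpy as np
import h5features

FeatureItem = collections.namedtuple('FeatureItem', ['times', 'data'])
features = {
    'utt1': FeatureItem(np.linspace(0, 1, 4), np.random.randn(4, 13)),
    'utt2': FeatureItem(np.linspace(0, 1, 6), np.random.randn(6, 13))}

with h5features.Writer('sketch.h5', mode='a') as writer:
    for name, item in features.items():
        data = h5features.Data([name], [item.times], [item.data])
        writer.write(data, groupname='features', append=True)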
def save(self, items, times):
    '''Save the internal responses to self.path.

    :param items: same items used to save the embeddings
    :param times: same times used to save the embeddings
    '''
    data = h5features.Data(items, times, self.intern_responses, check=True)
    with h5features.Writer(self.path) as fh:
        fh.write(data, 'features')
def embed(self):
    """Embed method to embed features based on a saved network"""
    if self.network_path is not None:
        self.network.load_network(self.network_path)
    self.network.eval()
    if self.cuda:
        self.network.cuda()

    with h5features.Reader(self.feature_path, 'features') as fh:
        features = fh.read()
    items = features.items()
    times = features.labels()
    feats = features.features()

    embeddings_spk, embeddings_phn = [], []
    for feat in feats:
        if feat.dtype != np.float32:
            feat = feat.astype(np.float32)
        feat_torch = Variable(torch.from_numpy(feat), volatile=True)
        if self.cuda:
            feat_torch = feat_torch.cuda()
        emb_spk, emb_phn, _, _ = self.network(feat_torch, feat_torch)
        emb_spk = emb_spk.cpu()
        emb_phn = emb_phn.cpu()
        embeddings_spk.append(emb_spk.data.numpy())
        embeddings_phn.append(emb_phn.data.numpy())

    data_spk = h5features.Data(items, times, embeddings_spk, check=True)
    data_phn = h5features.Data(items, times, embeddings_phn, check=True)
    with h5features.Writer(self.output_path + '.spk') as fh:
        fh.write(data_spk, 'features')
    with h5features.Writer(self.output_path + '.phn') as fh:
        fh.write(data_phn, 'features')
def extract_h5_features(audio_features=None, ema_features=None,
                        inverter=None, output_name='%s_features' % corpus,
                        articulators=None, dynamic_ema=True,
                        sampling_rate=100):
    """Build an h5 file recording audio features associated with {0} data.

    audio_features : optional name of audio features to use, including
                     normalization indications
    ema_features   : optional name of ema features' normalization to use
                     (use '' for raw data and None for no EMA data)
    inverter       : optional acoustic-articulatory inverter whose
                     predictions to use, based on the audio features
    output_name    : base name of the output file (default '{0}_features')
    articulators   : optional list of articulators to keep among EMA data
    dynamic_ema    : whether to include dynamic articulatory features
                     (bool, default True)
    sampling_rate  : sampling rate of the frames, in Hz (int, default 100)
    """
    # Arguments serve modularity; pylint: disable=too-many-arguments
    nonlocal abx_folder, get_utterances, _setup_features_loader

    # Build the abx folder, if necessary.
    if not os.path.isdir(abx_folder):
        os.makedirs(abx_folder)

    # Check that the destination file does not exist.
    output_file = os.path.join(abx_folder, '%s.features' % output_name)
    if os.path.isfile(output_file):
        raise FileExistsError("File '%s' already exists." % output_file)

    # Set up the features loading function.
    load_features = _setup_features_loader(
        audio_features, ema_features, inverter, dynamic_ema, articulators)

    # Load the list of utterances and process them iteratively.
    utterances = get_utterances()
    with h5f.Writer(output_file) as writer:
        for i in range(0, len(utterances), 100):
            # Load or compute utterances list, features and time labels.
            items = utterances[i:i + 100]
            features = [load_features(item) for item in items]
            labels = [
                np.arange(len(data)) / sampling_rate for data in features]
            # Write the currently processed utterances' data to h5.
            writer.write(
                h5f.Data(items, labels, features, check=True),
                groupname='features', append=True)
def embed(self):
    """Embed method to embed features based on a saved network"""
    if self.network_path is not None:
        self.network.load_network(self.network_path)
    self.network.eval()
    if self.cuda:
        self.network.cuda()
    print("Done loading network weights")

    with h5features.Reader(self.feature_path, 'features') as fh:
        features = fh.read()
    items = features.items()
    times = features.labels()
    feats = features.features()
    print("Done loading input feature file")

    embeddings = []
    for feat in feats:
        if feat.dtype != np.float32:
            feat = feat.astype(np.float32)
        # Split long utterances into batches to bound memory usage.
        n_batches = len(feat) // self.batch_size + 1
        batches_feat = np.array_split(feat, n_batches)
        outputs = []
        for b_feat in batches_feat:
            feat_torch = Variable(torch.from_numpy(b_feat), volatile=True)
            if self.cuda:
                feat_torch = feat_torch.cuda()
            emb, _ = self.network(feat_torch, feat_torch)
            emb = emb.cpu()
            outputs.append(emb.data.numpy())
        outputs = np.vstack(outputs)
        embeddings.append(outputs)

    data = h5features.Data(items, times, embeddings, check=True)
    with h5features.Writer(self.output_path) as fh:
        fh.write(data, 'features')