def create_zmq_mux(streamers, num_cpus, active_streamers, streamer_rate,
                   weights=None):
    num_streamers = len(streamers)
    if weights is None:
        weights = np.ones((num_streamers,))
    weights = np.array(weights)
    # Normalize to sum to 1
    weights = weights / weights.sum()

    partition_size = max(int(num_streamers / float(num_cpus)), 1)
    zmq_streamers = []
    zmq_weights = []
    actual_num_cpus = 0
    for idx in range(num_cpus):
        start = partition_size * idx
        stop = min(len(streamers), start + partition_size)
        if start >= stop:
            break
        actual_num_cpus += 1

        weight = sum(weights[start:stop])
        zmq_weights.append(weight)

        sub_weights = np.array(weights[start:stop]) / weight
        zmq_streamers.append(
            pescador.ZMQStreamer(
                pescador.StochasticMux(streamers[start:stop],
                                       n_active=int(active_streamers * weight),
                                       rate=streamer_rate,
                                       weights=sub_weights)))

    return pescador.StochasticMux(zmq_streamers, n_active=actual_num_cpus,
                                  rate=None, weights=zmq_weights)
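# A minimal usage sketch of create_zmq_mux (the counting generator below is
# hypothetical, not part of the original code): build 100 trivial streamers
# and partition them across 4 ZMQ worker processes.
def count_from(start):
    while True:
        yield start

streamers = [pescador.Streamer(count_from, i) for i in range(100)]
mux = create_zmq_mux(streamers, num_cpus=4, active_streamers=16,
                     streamer_rate=64)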
def _build_streamer(self, start_index: int, end_index: int) -> pescador.Streamer:
    """Create a pescador streamer for the provided indices into the dataset."""
    if (self.streamer_settings["n_frames"] is None
            or self.streamer_settings["n_target_frames"] is None):
        raise ValueError(
            "n_frames and n_target_frames are currently required in the config "
            "for an Iterable dataset.")

    audiofile_streamers = [
        _gen_frames(
            self.audioset_dataset,
            index,
            self.streamer_settings["n_frames"],
            self.streamer_settings["n_target_frames"],
        )
        for index in range(start_index, end_index)
    ]

    if self.evaluate:
        audiofile_mux = pescador.RoundRobinMux(audiofile_streamers)
    else:
        audiofile_mux = pescador.StochasticMux(
            audiofile_streamers,
            # TODO: eventually, this should probably be a function of
            # <batch size> & <# workers>, i.e. (batch_size / num_workers)
            n_active=self.streamer_settings["n_active"],
            # On average, how many samples are generated from a stream
            # before it dies.
            rate=self.streamer_settings["rate"],
        )
    return audiofile_mux
def lstm_data_generator(speech_list, noise_dir, srir_dir, sc_to_pos_dict,
                        num_frames, num_frames_hop, fft_size, hop_size, sr,
                        batch_size, active_streamers, rate,
                        random_state=12345678):
    sc_list = get_sc_list(sc_to_pos_dict)
    azi_list, elv_list = zip(*sc_list)
    azi_list = np.array(list(azi_list))
    elv_list = np.array(list(elv_list))
    steer_mat = steer_vector(azi_list, elv_list)

    seeds = []
    for speech_path in speech_list:
        if not speech_path.endswith('.wav'):
            continue
        streamer = pescador.Streamer(lstm_speech_mask_sampler, speech_path,
                                     noise_dir, srir_dir, sc_to_pos_dict,
                                     azi_list, elv_list, steer_mat,
                                     num_frames, num_frames_hop, fft_size,
                                     hop_size, sr)
        seeds.append(streamer)

    # Randomly shuffle the seeds
    random.shuffle(seeds)

    mux = pescador.StochasticMux(seeds, active_streamers, rate=rate,
                                 random_state=random_state)

    if batch_size == 1:
        return mux
    else:
        return pescador.maps.buffer_stream(mux, batch_size)
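# A minimal usage sketch (all paths, dictionaries, and parameter values here
# are hypothetical): stream buffered batches of masked-speech examples.
batches = lstm_data_generator(speech_list, 'noise/', 'srir/', sc_to_pos_dict,
                              num_frames=25, num_frames_hop=13, fft_size=512,
                              hop_size=256, sr=16000, batch_size=8,
                              active_streamers=16, rate=64)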
def __init__(self, source_filepath, seq_len=512, hop=None, normalize=True,
             transform=None, restart_streams=False):
    super().__init__()

    source_folder = Path(source_filepath)
    self.seq_len = seq_len
    if hop is None:
        hop = seq_len
    self.hop = hop
    self.normalize = normalize
    self.transform = transform

    # get songs' paths
    songs = []
    for root, dirs, files in os.walk(source_folder):
        for name in files:
            songs.append(os.path.join(root, name))
    # restrict to wav files (damn .DS_Store)
    songs = [song for song in songs if song.endswith('.wav')]

    # get song lengths (legacy torchaudio.info API, which returns a
    # (SignalInfo, EncodingInfo) tuple)
    data = []
    for song in songs:
        song_info = torchaudio.info(song)
        data.append({
            "path": song,
            "len": int(song_info[0].length / song_info[0].channels)
        })
    self.data = data

    # mux the per-song streams
    if restart_streams:
        streams = [
            pescador.Streamer(generate_rnd_chunk, track['path'],
                              track['len'], seq_len, normalize, transform)
            for track in data
        ]
        self.mux = pescador.ShuffledMux(streams)
    else:
        streams = [
            pescador.Streamer(generate_chunk, track['path'], track['len'],
                              seq_len, hop, normalize, transform)
            for track in data
        ]
        self.mux = pescador.StochasticMux(streams, len(streams), rate=None,
                                          mode='exhaustive')
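# A minimal usage sketch (the dataset path is hypothetical): the mux built in
# __init__ can be iterated directly to draw fixed-length audio chunks.
dataset = MusicDataset('/path/to/wavs', seq_len=512)
for chunk in dataset.mux(max_iter=4):
    pass  # each iterate is one seq_len-sample chunk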
def data_generator(directories, sampler, k, rate, batch_size=16, slices=None,
                   **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler'''
    seeds = []
    for working in directories:
        for track in tqdm(find_files(working, ext='h5')):
            fname = os.path.join(working, track)
            seeds.append(data_sampler(fname, sampler, slices))

    # Send it all to a mux
    mux = pescador.StochasticMux(seeds, k, rate, mode='with_replacement',
                                 **kwargs)
    return pescador.buffer_stream(mux, batch_size, axis=0)
def data_generator(working, tracks, sampler, k, augment=True, rate=8,
                   **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler'''
    seeds = []
    for track in tracks:
        fname = os.path.join(working, os.path.extsep.join([track, 'h5']))
        seeds.append(pescador.Streamer(data_sampler, fname, sampler))

        if augment:
            for fname in sorted(glob(os.path.join(working,
                                                  '{}.*.h5'.format(track)))):
                seeds.append(pescador.Streamer(data_sampler, fname, sampler))

    # Send it all to a mux
    return pescador.StochasticMux(seeds, k, rate, **kwargs)
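# The generators above assume a `data_sampler` helper defined elsewhere.
# A minimal sketch of one possible implementation (the h5py dataset name and
# the `slices` handling are assumptions, not the original code):
import h5py

def data_sampler(fname, sampler, slices=None):
    """Yield samples drawn by `sampler` from the data stored at `fname`."""
    with h5py.File(fname, 'r') as h5:
        data = h5['data'][slices] if slices is not None else h5['data'][:]
    while True:
        yield sampler(data)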
def keras_generator(data_list, input_patch_size, batch_size=16,
                    active_str=200, muxrate=20):
    """Generator to be passed to a keras model"""
    print("Data list length is {}".format(len(data_list)))
    streams = []
    for fpath_in, fpath_out in data_list:
        streams.append(
            pescador.Streamer(
                patch_generator, fpath_in, fpath_out,
                input_patch_size=input_patch_size
            )
        )

    stream_mux = pescador.StochasticMux(streams, active_str, rate=muxrate,
                                        mode='with_replacement',
                                        random_state=RANDOM_STATE)

    batch_generator = pescador.buffer_stream(stream_mux, batch_size)

    for batch in batch_generator:
        print("\nBatch length: {}".format(len(batch['X1'])))
        yield [batch['X1'], batch['X2']], batch['Y']
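# A minimal usage sketch (assumes `data_list` holds (input_path, output_path)
# pairs and a compiled Keras `model`; the patch size is hypothetical):
train_gen = keras_generator(data_list, input_patch_size=(360, 50))
# model.fit_generator(train_gen, steps_per_epoch=100, epochs=10)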
def data_generator(working, tracks, sampler, k, batch_size=32, slices=None,
                   **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler'''
    seeds = []
    for track in tqdm(tracks):
        fname = working + os.path.extsep.join([str(track), 'h5'])
        seeds.append(
            pescador.Streamer(data_sampler, fname, sampler, slices=slices))

    # Send it all to a mux
    mux = pescador.StochasticMux(seeds, k, **kwargs)

    if batch_size == 1:
        return mux
    else:
        return pescador.buffer_stream(mux, batch_size, axis=0)
##############################################
# Put it all together
##############################################
input_shape, (X_train, Y_train), (X_test, Y_test) = setup_data()

steps_per_epoch = len(X_train) // batch_size

# Create two streams from the same data, where one of the streams
# adds a small amount of Gaussian noise. You could easily perform
# other data augmentations using the same 'map' strategy.
stream = sampler(X_train, Y_train)
noisy_stream = additive_noise(stream, 'X')

# Multiplex the two streamers together.
mux = pescador.StochasticMux(
    [stream, noisy_stream],
    # Two streams, always active.
    n_active=2,
    # We want to sample from each stream infinitely.
    rate=None)

# Buffer the stream into minibatches.
batches = pescador.buffer_stream(mux, batch_size)

model = build_model(input_shape)
try:
    print("Start time: {}".format(datetime.datetime.now()))
    model.fit_generator(
        pescador.tuples(batches, 'X', 'y'),
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        verbose=1,
        validation_data=(X_test, Y_test))
except KeyboardInterrupt:
# First, let's make a simple generator that makes an infinite
# sequence of a given letter.
def letter(c):
    while True:
        yield c

# Let's make the two populations of streamers
pop1 = [pescador.Streamer(letter, c) for c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ']
pop2 = [pescador.Streamer(letter, c) for c in 'abcdefghijklmnopqrstuvwxyz']

# We'll sample population 1 with 3 streamers active at any time.
# Each streamer will generate, on average, 5 samples before being
# replaced.
mux1 = pescador.StochasticMux(pop1, 3, 5)

# Let's have 5 active streamers for population 2, and replace
# them after 2 examples on average.
mux2 = pescador.StochasticMux(pop2, 5, 2)

####################
# Mux composition
####################
# We multiplex the two populations using a ShuffledMux.
# The ShuffledMux keeps all of its input streamers active,
# and draws samples independently at random from each one.

# This should generate an approximately equal number of upper- and
# lower-case letters, with more diversity among the lower-case letters.
hier_mux = pescador.ShuffledMux([mux1, mux2])
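# A quick sanity check (not part of the original example): draw 20 letters
# from the composed mux and print them as one string; roughly half should be
# upper-case and half lower-case.
print(''.join(hier_mux(max_iter=20)))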
                                      config['classes_vector'],
                                      label2ids_train, label2ids_val, config)
[ids_train, ids_val, _] = tmp_data

# pescador train: define streamer
train_pack = [
    config, config['train_sampling'], config['param_train_sampling']
]
train_streams = [
    pescador.Streamer(data_gen, id, id2audio_repr_path[id], id2gt_train[id],
                      train_pack)
    for id in ids_train
]
train_mux_stream = pescador.StochasticMux(
    train_streams,
    n_active=config['batch_size'] * 2,
    rate=None,
    mode='exhaustive')
train_batch_streamer = pescador.Streamer(
    pescador.buffer_stream, train_mux_stream,
    buffer_size=config['batch_size'], partial=True)

# pescador val: define streamer
val_batch_size = np.min([len(ids_val), config['val_batch_size']])
val_pack = [config, 'overlap_sampling', 1]
val_streams = [
    pescador.Streamer(data_gen, id, id2audio_repr_path[id], id2gt_val[id],
                      val_pack)
    for id in ids_val
]
# previously used streamers to be re-activated.
#
# For epoch-based sampling, we will use `exhaustive` mode to ensure
# that streamers are not reactivated within the epoch.
#
# Since each data stream produces exactly `M` examples, this would lead
# to a finite sample stream (i.e., only one epoch).
# To prevent the mux from exiting after the first epoch, we'll use
# `cycle` mode.

k = 100  # or however many streamers you want simultaneously active

# We'll use `rate=None` here so that the number of samples per stream is
# determined by the streamer (`M`) and not the mux.
mux = pescador.StochasticMux(streams, k, rate=None, mode='exhaustive')

epoch_stream = mux(cycle=True)

####################
# The `epoch_stream` will produce an infinite sequence of iterates.
# The same samples are presented (in random order) in the
# first `N*M`, second `N*M`, etc. disjoint sub-sequences, each
# of which may be considered as an *epoch*.
#
# *NOTE*: for this approach to work with something like `keras`'s
# `fit_generator` method, you need to be able to explicitly calculate
# the duration of an epoch, which means that the number of samples
# per streamer (`M` here) must be known in advance.
#
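# A minimal sketch of wiring this into a Keras-style loop (M and the batch
# size B are hypothetical values here): an epoch is N * M iterates, so the
# generator advances (N * M) // B steps per epoch.
N, M, B = len(streams), 50, 16
steps_per_epoch = (N * M) // B
batches = pescador.maps.buffer_stream(epoch_stream, B)
# model.fit_generator(..., steps_per_epoch=steps_per_epoch, ...)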
def multiplex_tfr(data_dir, n_hops, batch_size, mode="inference",
                  aug_kind_str="none", tfr_str="logmelspec",
                  label_inputs=False, partial_labels=True, structured=True,
                  active_streamers=32, streamer_rate=1024, num_cpus=1,
                  multi_label=False, align_perturb=False,
                  single_output="fine"):
    tfr_dir = os.path.join(data_dir, tfr_str)
    streams = []

    # Parse augmentation kind string (aug_kind_str).
    if mode == "train":
        if aug_kind_str == "none":
            augs = ["original"]
        elif aug_kind_str == "pitch":
            augs = ["original", "pitch"]
        elif aug_kind_str == "stretch":
            augs = ["original", "stretch"]
        elif aug_kind_str == "all-but-noise":
            augs = ["original", "pitch", "stretch"]
        elif aug_kind_str == "all":
            augs = ["original", "pitch", "stretch", "noise"]
        elif aug_kind_str == "noise":
            augs = ["original", "noise"]
        else:
            raise ValueError(
                'Invalid augmentation kind: {}'.format(aug_kind_str))

        # Generate a Pescador streamer for every HDF5 container, that is,
        # every unit-augmentation-instance triplet.
        aug_dict = get_augmentations()
        aug_list = []
        class_list = []
        class_count = Counter()
        for aug_str in augs:
            if aug_str == "original":
                instances = [aug_str]
            else:
                n_instances = aug_dict[aug_str]
                instances = ["-".join([aug_str, str(instance_id + 1)])
                             for instance_id in range(n_instances)]

            if aug_str == "noise" and tfr_str == "logmelspec":
                bias = np.float32(-17.0)
            else:
                bias = np.float32(0.0)

            for instanced_aug_str in instances:
                aug_dir = os.path.join(tfr_dir, instanced_aug_str)
                lms_name = "_".join(["*", instanced_aug_str])
                lms_pattern = os.path.join(aug_dir, lms_name + ".h5*")
                for lms_path in glob.glob(lms_pattern):
                    if not is_valid_data_hdf5(lms_path, partial_labels):
                        continue

                    taxonomy_code = os.path.splitext(
                        os.path.basename(lms_path))[0].split('_')[1].replace('-', '.')
                    triplet = annotations.get_taxonomy_code_idx_triplet(
                        taxonomy_code)
                    coarse_idx, medium_idx, fine_idx = triplet
                    if structured or single_output == "fine":
                        bal_idx = fine_idx
                    elif single_output == "medium":
                        bal_idx = medium_idx
                    elif single_output == "coarse":
                        bal_idx = coarse_idx
                    else:
                        raise ValueError(
                            "Invalid single output mode: {}".format(single_output))

                    class_list.append(bal_idx)
                    class_count[bal_idx] += 1
                    aug_list.append(aug_str)

                    stream = pescador.Streamer(yield_tfr, lms_path, n_hops,
                                               bias, tfr_str, mode,
                                               label_inputs, multi_label,
                                               align_perturb)
                    streams.append(stream)

        num_streamers = len(streams)
        num_fine_classes = len(class_count)
        num_aug = len([k for k in aug_dict.keys() if k != "original"])
        class_weights = {cls: (num_streamers / float(num_fine_classes * count))
                         for cls, count in class_count.items()}
        aug_weights = {aug: 1.0 if aug == "original" else 1.0 / num_aug
                       for aug, n_inst in aug_dict.items()}

        # Weight examples to balance classes, so that each file within a
        # class is sampled evenly. Additionally, balance so that sampling any
        # augmentation type (or the original) is equally likely, regardless
        # of the number of instances per augmentation. Within an augmentation
        # type, instances are equally likely.
        weights = [class_weights[cls] * aug_weights[aug]
                   for cls, aug in zip(class_list, aug_list)]

        # Multiplex streamers together.
        if num_cpus > 1:
            mux = create_zmq_mux(streams, num_cpus, active_streamers,
                                 streamer_rate, weights=weights)
        else:
            mux = pescador.StochasticMux(streams, n_active=active_streamers,
                                         rate=streamer_rate, weights=weights)

        # Create buffered streamer with specified batch size.
        buffered_streamer = pescador.maps.buffer_stream(mux, batch_size)
    else:
        # If not dealing with augmentations, just go through all HDF5 files.
        weights = None
        bias = np.float32(0.0)
        for fname in os.listdir(data_dir):
            lms_path = os.path.join(data_dir, fname)
            if not is_valid_data_hdf5(lms_path, partial_labels):
                continue

            stream = pescador.Streamer(yield_tfr, lms_path, n_hops, bias,
                                       tfr_str, mode, label_inputs,
                                       multi_label, align_perturb)
            streams.append(stream)

        # Multiplex streamers together, but iterate exhaustively.
        mux = pescador.ChainMux(streams, mode='exhaustive')

        # Create buffered streamer with specified batch size.
        buffered_streamer = cycle_partial_buffer_stream(mux, batch_size)

    inputs = ["tfr_input"]
    if mode in ('train', 'valid') and structured and label_inputs:
        inputs += ["coarse_label_input", "medium_label_input"]

    if structured:
        outputs = ["y_coarse", "y_medium", "y_fine"]
    else:
        outputs = ["y_" + single_output]

    return pescador.maps.keras_tuples(buffered_streamer,
                                      inputs=inputs,
                                      outputs=outputs)
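# A minimal usage sketch of multiplex_tfr (the data directory and the
# hyperparameter values are hypothetical): build a training generator and
# hand it to a Keras-style fit loop.
train_gen = multiplex_tfr("/path/to/data", n_hops=104, batch_size=32,
                          mode="train", aug_kind_str="all")
# model.fit_generator(train_gen, steps_per_epoch=512, epochs=10)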
        i = np.random.randint(0, n)
        yield {'X': X[i], 'Y': y[i]}

streams = [npz_generator(x) for x in datasets]

##############################################
# Option 1: Stream equally from each dataset
##############################################
# If you can easily fit all the datasets in memory and you want to
# sample from them equally, you would set up your Mux as follows:
mux = pescador.StochasticMux(
    streams,
    # Three streams, always active.
    n_active=len(streams),
    # We want to sample from each stream infinitely,
    # so we turn off the rate parameter, which
    # controls how long to sample from each stream.
    rate=None)

##############################################
# Option 2: Sample from one at a time.
##############################################
# Another approach might be to restrict sampling to one stream at a time.
# Now, the rate parameter controls (statistically) how long to sample
# from a stream before activating a new stream.
mux = pescador.StochasticMux(
    streams,
    # Only allow one active stream
    n_active=1,
# load ground truth
FILE_GROUND_TRUTH_TEST = (config_file.DATA_FOLDER + 'index/' + DATASET +
                          '/gt_' + DATASET + '_fold' + str(config['fold']) +
                          '_test.tsv')
[ids, id2gt] = shared.load_id2gt(FILE_GROUND_TRUTH_TEST)
[_, id2label] = shared.load_id2label(FILE_GROUND_TRUTH_TEST)
print(FILE_GROUND_TRUTH_TEST)

# pescador: define (finite, batched & parallel) streamer
pack = [config, 'overlap_sampling', 1]
streams = [
    pescador.Streamer(sl_train.data_gen, id, id2audio_repr_path[id],
                      id2gt[id], pack)
    for id in ids
]
mux_stream = pescador.StochasticMux(streams, n_active=TEST_BATCH_SIZE * 2,
                                    rate=None, mode='exhaustive')
batch_streamer = pescador.Streamer(pescador.buffer_stream, mux_stream,
                                   buffer_size=TEST_BATCH_SIZE)
# batch_streamer = pescador.ZMQStreamer(batch_streamer)

# tensorflow: define model and cost
[x, y_, is_train, y, normalized_y, cost] = \
    sl_train.tf_define_model_and_cost(config)

# tensorflow: compute the accuracy of each model
accuracies = []
fgt = open(experiment_folder + 'models.list')
for model_name in fgt.readlines():
    print(model_name)
def generate_urls(
    queries: Dict,
    label: Optional[str] = None,
    split_streams_by: Optional[Union[str, List]] = None,
    subset_streams: Optional[Union[str, Dict]] = None,
    nb_samples_per_stream: Optional[int] = None,
    nb_samples: Optional[int] = None,
    weighted_streams: bool = False,
    cache_requests: bool = False,
    mediatype: str = "StillImage",
    license_info: bool = True,
    one_media_per_occurrence: bool = True,
    verbose: bool = False,
):
    """Provides a url generator for the given query.

    Args:
        queries (Dict): dictionary of queries supported by the GBIF api
        label (str, optional): Output label name. Defaults to `None`, which
            yields all metadata.
        nb_samples (int): Limit the total number of samples retrieved from
            the API. When set to -1 and `split_streams_by` is not `None`, a
            minimum number of samples is calculated from the number of
            available samples per stream. Defaults to `None`, which retrieves
            samples from all streams until all streams are exhausted.
        nb_samples_per_stream (int): Limit the maximum number of items to be
            retrieved per stream. Defaults to `None`, which retrieves samples
            from a stream until its generator is exhausted.
        split_streams_by (Optional[Union[str, List]], optional): Stream
            identifiers to be balanced. Defaults to None.
        subset_streams (Optional[Union[str, Dict]], optional): Map certain
            streams into separate subsets by setting the `subset` metadata.
            Supports a remainder value of `"*"`, which acts as a wildcard.
            Defaults to None. E.g.
            `subset_streams={"train": {"speciesKey": [5352251, 3190653]},
            "test": {"speciesKey": "*"}}` will move species 5352251 and
            3190653 into `train`, whereas all other species will go into
            `test`.
        weighted_streams (bool): Calculate sampling weights for all streams
            and apply them during sampling. To be combined with `nb_samples`
            not `None`. Defaults to `False`.
        cache_requests (bool, optional): Enable the GBIF API cache. Can
            significantly speed up API requests. Defaults to False.
        mediatype (str): Supported GBIF media type. Can be `StillImage`,
            `MovingImage`, or `Sound`. Defaults to `StillImage`.
        license_info (bool): Retrieve image license information. Defaults
            to True.
        one_media_per_occurrence (bool): Only retrieve one media item from
            occurrences with multiple media.
            Defaults to True.

    Returns:
        Iterable: generator-like object that yields dictionaries
    """
    streams = []

    # set pygbif api caching
    pygbif.caching(cache_requests)

    # copy queries since we delete keys from the dict
    q = queries.copy()

    # if weighted_streams and nb_samples_per_stream is not None:
    #     raise RuntimeError(
    #         "weights can only be applied when the number of samples is limited.")

    # Split queries into a product of streamers
    if split_streams_by is not None:
        balance_queries = {}
        # if a single string is provided, convert it into a list
        if isinstance(split_streams_by, str):
            split_streams_by = [split_streams_by]

        # remove balance_by from the query and move it to balance_queries
        for key in split_streams_by:
            balance_queries[key] = q.pop(key)

        # for each b in balance_queries, create a separate stream;
        # later we control the sampling process of these streams to balance them
        for b in _dproduct(balance_queries):
            subset = None
            # wrap each stream in a pescador Streamer for additional features
            for key, value in b.items():
                if subset_streams is not None:
                    for x, y in subset_streams.items():
                        result = y.get(key)
                        if result is not None:
                            if isinstance(result, list):
                                for item in result:
                                    if value == item:
                                        subset = x
                            else:
                                if value == result:
                                    subset = x
                            # assign remainder class
                            if result == "*" and subset is None:
                                subset = x

            streams.append(
                pescador.Streamer(
                    pescador.Streamer(
                        gbif_query_generator,
                        label=label,
                        mediatype=mediatype,
                        subset=subset,
                        license_info=license_info,
                        one_media_per_occurrence=one_media_per_occurrence,
                        **q,
                        **b,
                    ),
                    # this makes sure that we only obtain a maximum number
                    # of samples per stream
                    max_iter=nb_samples_per_stream,
                ))

        if verbose:
            nb_queries = [
                gbif_count(mediatype=mediatype, **q, **b)
                for b in _dproduct(balance_queries)
            ]
            print(sum(nb_queries))

        # count the available occurrences for each stream and select the
        # minimum; we only yield the minimum per stream to balance
        if nb_samples == -1:
            # calculate the minimum number of samples available per stream
            nb_samples = min([
                gbif_count(mediatype=mediatype, **q, **b)
                for b in _dproduct(balance_queries)
            ]) * len(streams)

        if weighted_streams:
            weights = np.array([
                float(gbif_count(mediatype=mediatype, **q, **b))
                for b in _dproduct(balance_queries)
            ])
            weights /= np.max(weights)
        else:
            weights = None

        mux = pescador.StochasticMux(
            streams,
            n_active=len(streams),  # all streams are always active
            rate=None,              # all streams are balanced
            weights=weights,        # weight streams
            mode="exhaustive",      # if one stream fails, it is not revived
        )
        return mux(max_iter=nb_samples)

    # else there will be only one stream, hence no balancing or sampling
    else:
        if nb_samples and nb_samples_per_stream:
            nb_samples = min(nb_samples, nb_samples_per_stream)
        if verbose:
            print(nb_samples)

        return pescador.Streamer(
            gbif_query_generator,
            label=label,
            mediatype=mediatype,
            license_info=license_info,
            one_media_per_occurrence=one_media_per_occurrence,
            **q,
        ).iterate(max_iter=nb_samples)
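# A minimal usage sketch (the query values are hypothetical; the species keys
# are borrowed from the docstring example): balance streams per species and
# pull a total sample count derived from the smallest stream.
urls = generate_urls(
    queries={"speciesKey": [5352251, 3190653], "country": "DE"},
    split_streams_by="speciesKey",
    nb_samples=-1,
    mediatype="StillImage",
)
for record in urls:
    print(record)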