def __call__(self, protocol, subset='train'):

    self.initialize(protocol, subset=subset)

    batch_size = self.batch_size
    batches_per_epoch = self.batches_per_epoch

    generators = []

    if self.parallel:
        for _ in range(self.parallel):
            generator = self.generator()
            batches = batchify(generator, self.signature,
                               batch_size=batch_size,
                               prefetch=batches_per_epoch)
            generators.append(batches)
    else:
        generator = self.generator()
        batches = batchify(generator, self.signature,
                           batch_size=batch_size, prefetch=0)
        generators.append(batches)

    while True:
        # get `batches_per_epoch` batches from each generator
        for batches in generators:
            for _ in range(batches_per_epoch):
                yield next(batches)
def postprocess_ndarray(self, X):
    """Embed (fixed-length) sequences

    Parameters
    ----------
    X : (batch_size, n_samples, n_features) numpy array
        Batch of input sequences

    Returns
    -------
    fX : (batch_size, n_dimensions) numpy array
        Batch of sequence embeddings.
    """
    batch_size, n_samples, n_features = X.shape

    # this test is needed because .apply() may be called
    # with an ndarray of arbitrary size as input
    if batch_size <= self.batch_size:
        X = torch.tensor(X, dtype=torch.float32, device=self.device)
        cpu = torch.device('cpu')
        return self.model(X).detach().to(cpu).numpy()

    # if X contains too large a batch, split it into smaller batches...
    batches = batchify(iter(X), {'@': (None, np.stack)},
                       batch_size=self.batch_size,
                       incomplete=True, prefetch=0)

    # ... and process them in order, before re-concatenating them
    return np.vstack([self.postprocess_ndarray(x) for x in batches])
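# A minimal, self-contained sketch of the same split-and-restack pattern,
# with `np.array_split` standing in for `batchify` and a hypothetical
# `embed_batch` standing in for `self.model` (illustration only, not part
# of the original code):

import numpy as np

def embed_batch(x):
    # toy stand-in for the model forward pass: average over the time axis
    return x.mean(axis=1)

def embed_any_size(X, batch_size=32):
    if len(X) <= batch_size:
        return embed_batch(X)
    # split into chunks no larger than batch_size, embed each, re-stack
    n_chunks = int(np.ceil(len(X) / batch_size))
    return np.vstack([embed_any_size(x, batch_size=batch_size)
                      for x in np.array_split(X, n_chunks)])

X = np.random.randn(100, 50, 3).astype(np.float32)
assert embed_any_size(X, batch_size=32).shape == (100, 3)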
def __call__(self, protocol, subset='train'):
    """(Parallelized) batch generator"""

    # pre-load useful information about protocol once and for all
    self.initialize(protocol, subset=subset)

    # number of batches needed to complete an epoch
    batches_per_epoch = self.batches_per_epoch

    generators = []

    if self.parallel:
        for _ in range(self.parallel):
            # initialize one sample generator
            samples = self.samples()

            # batchify it and make sure at least
            # `batches_per_epoch` batches are prefetched.
            batches = batchify(samples, self.signature,
                               batch_size=self.batch_size,
                               prefetch=batches_per_epoch)

            # add batch generator to the list of (background) generators
            generators.append(batches)
    else:
        # initialize one sample generator
        samples = self.samples()

        # batchify it without prefetching
        batches = batchify(samples, self.signature,
                           batch_size=self.batch_size, prefetch=0)

        # add it to the list of generators
        # NOTE: this list will only contain one generator
        generators.append(batches)

    # loop on (background) generators indefinitely
    while True:
        for batches in generators:
            # yield `batches_per_epoch` batches from current generator
            # so that each epoch is covered by exactly one generator
            for _ in range(batches_per_epoch):
                yield next(batches)
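# A minimal, self-contained sketch of the round-robin scheme above, with
# plain counting generators standing in for the prefetched background
# batch generators (names here are illustrative, not from the original
# code). Each run of `batches_per_epoch` batches is served by exactly one
# generator, so the others can keep prefetching in the background:

def fake_batches(worker_id):
    b = 0
    while True:
        yield 'worker {} / batch {}'.format(worker_id, b)
        b += 1

batches_per_epoch = 3
generators = [fake_batches(i) for i in range(2)]

def interleaved():
    while True:
        for batches in generators:
            for _ in range(batches_per_epoch):
                yield next(batches)

stream = interleaved()
for _ in range(6):
    print(next(stream))
# worker 0 / batch 0 ... worker 0 / batch 2, then worker 1 / batch 0 ...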
def _get_batch_generator_y(self, data_h5):
    """Get batch generator

    Parameters
    ----------
    data_h5 : str
        Path to HDF5 file containing precomputed sequences.
        It must have two aligned datasets 'X' and 'y'.

    Returns
    -------
    batch : dict
        Dictionary with keys 'batch_generator' (iterable),
        'batches_per_epoch' (int), 'n_classes' (int) and
        'classes' (ndarray of unique labels).
    """
    fp = h5py.File(data_h5, mode='r')
    h5_X = fp['X']
    h5_y = fp['y']

    # keep track of number of labels and rename labels to integers
    unique, y = np.unique(h5_y, return_inverse=True)
    n_classes = len(unique)

    index_generator = random_label_index(y, per_label=self.per_label,
                                         return_label=False)

    def generator():
        while True:
            i = next(index_generator)
            yield {'X': h5_X[i], 'y': y[i]}

    signature = {'X': {'type': 'ndarray'}, 'y': {'type': 'ndarray'}}
    batch_size = self.per_batch * self.per_fold * self.per_label
    batch_generator = batchify(generator(), signature, batch_size=batch_size)
    batches_per_epoch = n_classes // (self.per_batch * self.per_fold) + 1

    return {'batch_generator': batch_generator,
            'batches_per_epoch': batches_per_epoch,
            'n_classes': n_classes,
            'classes': unique}
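# A toy stand-in for `batchify`, assuming (from its usage above) that it
# groups `batch_size` consecutive samples and stacks 'ndarray' entries of
# the signature along a new first axis. `toy_batchify` and `samples` are
# hypothetical names, for illustration only:

import numpy as np

def toy_batchify(samples, signature, batch_size):
    while True:
        batch = [next(samples) for _ in range(batch_size)]
        yield {key: np.stack([sample[key] for sample in batch])
               for key in signature}

def samples():
    while True:
        yield {'X': np.random.randn(50, 3), 'y': np.random.randint(5)}

signature = {'X': {'type': 'ndarray'}, 'y': {'type': 'ndarray'}}
batch = next(toy_batchify(samples(), signature, batch_size=4))
assert batch['X'].shape == (4, 50, 3) and batch['y'].shape == (4,)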
def train(x_paths, y_paths, weights_dir, x_paths_audio=None):

    n_samples, n_positive = statistics(y_paths)

    # estimate performance of "majority class" baseline
    baseline = 100 * n_positive / n_samples
    print('Baseline = {0:.1f}%'.format(baseline))

    # estimate number of batches per epoch
    steps_per_epoch = n_samples // BATCH_SIZE

    # create batch generator
    generator = get_generator(x_paths, y_paths)
    if CATEGORICAL:
        signature = ({'type': 'ndarray'}, {'type': 'ndarray'})
    else:
        signature = ({'type': 'ndarray'}, {'type': 'scalar'})
    batch_generator = batchify(generator, signature, batch_size=BATCH_SIZE)

    print(INPUT_DIMS)

    # create model
    if CATEGORICAL:
        model = StackedLSTM()((25, INPUT_DIMS))
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])
    else:
        model = StackedLSTM(final_activation="sigmoid",
                            n_classes=1)((25, INPUT_DIMS))
        # only 4 units
        # model = StackedLSTM(lstm=[4, ], mlp=[], final_activation="sigmoid",
        #                     n_classes=1)((25, INPUT_DIMS))
        model.compile(loss='binary_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])

    # train model
    model_h5 = weights_dir + '/{epoch:04d}.h5'
    callbacks = [ModelCheckpoint(model_h5, period=1)]
    model.fit_generator(batch_generator, steps_per_epoch,
                        epochs=1000, verbose=1,
                        callbacks=callbacks, workers=1)
def validate_ev(x_paths, y_paths, input_model):

    generator = get_generator(x_paths, y_paths, forever=False)
    signature = ({'type': 'ndarray'}, {'type': 'scalar'})
    batch_generator = batchify(generator, signature, batch_size=BATCH_SIZE)

    Y_true, Y_pred = [], []
    for X, y in batch_generator:
        # Y_pred.append(model.predict(X)[:, :, 1].reshape((-1, 1)))
        # Y_true.append(y[:, :, 1].reshape((-1, 1)))
        Y_pred.append(input_model.predict(X).reshape((-1, 1)))
        Y_true.append(y.reshape((-1, 1)))

    y_true = np.vstack(Y_true)
    # check y_pred values
    y_pred = np.vstack(Y_pred)

    return y_true, y_pred
def validate(x_paths, y_paths, weights_dir):

    epoch = 0
    f = open(WEIGHTS_DIR + "/list_test_prec_rec_auc", 'w')

    while True:
        # sleep until next epoch is finished
        model_h5 = weights_dir + '/{epoch:04d}.h5'.format(epoch=epoch)
        if not os.path.isfile(model_h5):
            time.sleep(10)
            continue

        model = load_model(model_h5)

        generator = get_generator(x_paths, y_paths, forever=False)
        if CATEGORICAL:
            signature = ({'type': 'ndarray'}, {'type': 'ndarray'})
        else:
            signature = ({'type': 'ndarray'}, {'type': 'scalar'})
        batch_generator = batchify(generator, signature, batch_size=BATCH_SIZE)

        Y_true, Y_pred = [], []
        for X, y in batch_generator:
            # Y_pred.append(model.predict(X)[:, :, 1].reshape((-1, 1)))
            # Y_true.append(y[:, :, 1].reshape((-1, 1)))
            Y_pred.append(model.predict(X).reshape((-1, 1)))
            Y_true.append(y.reshape((-1, 1)))

        y_true = np.vstack(Y_true)
        y_pred = np.vstack(Y_pred)

        # auc = roc_auc_score(y_true, y_pred, average='macro', sample_weight=None)
        auc = average_precision_score(y_true, y_pred,
                                      average='macro', sample_weight=None)
        print('#{epoch:04d} {auc:.4f}%'.format(epoch=epoch + 1, auc=100 * auc))
        f.write("{},".format(auc))
        f.flush()

        epoch += 1
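# A quick, self-contained reminder of what the metric reports:
# `average_precision_score` summarizes the precision-recall curve, which
# is usually more informative than ROC AUC on imbalanced detection tasks
# like this one (example values are illustrative):

import numpy as np
from sklearn.metrics import average_precision_score

y_true = np.array([0, 0, 1, 1])
y_pred = np.array([0.1, 0.4, 0.35, 0.8])
print(average_precision_score(y_true, y_pred))  # ~0.83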
def embed(self, embedding, X, internal=False):
    """Apply embedding on sequences

    Parameters
    ----------
    embedding : keras.Model
        Current state of embedding network
    X : (n_sequences, n_samples, n_features) numpy array
        Batch of input sequences
    internal : bool, optional
        Set to True to return internal representation

    Returns
    -------
    fX : (n_sequences, ...) numpy array
        Batch of embeddings.
    """
    if internal:
        embed = K.function(
            [embedding.get_layer(name='input').input, K.learning_phase()],
            [embedding.get_layer(name='internal').output])

        # split large batch into smaller batches if needed
        if len(X) > self.batch_size:
            batch_generator = batchify(iter(X), {'type': 'ndarray'},
                                       batch_size=self.batch_size,
                                       incomplete=True)
            # NOTE: np.vstack needs a sequence, not a bare generator
            fX = np.vstack([embed([x, 0])[0] for x in batch_generator])
        else:
            fX = embed([X, 0])[0]
    else:
        fX = embedding.predict(X, batch_size=self.batch_size)

    return fX.astype(self.float_autograd_)
training_file, development_file, BATCH_SIZE, optimizer_name, output_path = \
    process_arguments()

training_no_samples, training_sequence_no_samples, validation_no_samples, \
    validation_sequence_no_samples, validation_start, development_no_samples, \
    development_sequence_no_samples, index_arr_train, index_arr_validate, \
    index_array_dev = utils.set_no_samples(
        training_file, development_file, True, USE_VALIDATION,
        TRAINING_RATIO, VALIDATION_RATIO, SEQUENCE_LENGTH, STEP)

# create batch generator
signature = ({'type': 'ndarray'}, {'type': 'ndarray'})
training_generator = utils.lstm_generator(
    training_file, "training", validation_start, index_arr_train,
    index_arr_validate, SEQUENCE_LENGTH, STEP,
    FIRST_DERIVATIVE, SECOND_DERIVATIVE)
batch_training_generator = batchify(training_generator, signature,
                                    batch_size=BATCH_SIZE)

steps_per_epoch_train, _, _ = utils.calculate_steps_per_epoch(
    training_sequence_no_samples, 0, 0, batch_size=BATCH_SIZE)

training_percentage = utils.compute_samples_majority_class(
    training_file, type="training", start=0, end=training_no_samples)
print("Training set +ve label percentage: " + str(training_percentage))

if SAVE_SEQUENCES:
    # remaining arguments assumed to mirror the training call above
    validation_generator = utils.lstm_generator(
        training_file, "validation", validation_start, index_arr_train,
        index_arr_validate, SEQUENCE_LENGTH, STEP,
        FIRST_DERIVATIVE, SECOND_DERIVATIVE)
def _get_batch_generator_z(self, data_h5):
    """Get batch generator (sequences grouped by 'z')"""
    fp = h5py.File(data_h5, mode='r')
    h5_X = fp['X']
    h5_y = fp['y']
    h5_z = fp['z']

    df = pd.DataFrame({'y': h5_y, 'z': h5_z})
    z_groups = df.groupby('z')
    y_groups = [group.y.iloc[0] for _, group in z_groups]

    # keep track of number of labels and rename labels to integers
    unique, y = np.unique(y_groups, return_inverse=True)
    n_classes = len(unique)

    index_generator = random_label_index(y, per_label=self.per_label,
                                         return_label=True, repeat=False)

    def generator():
        while True:
            # get next group
            i, label = next(index_generator)

            # select at most 10 sequences of current group
            selector = list(z_groups.get_group(i).index)
            selector = np.random.choice(selector,
                                        size=min(10, len(selector)),
                                        replace=False)

            X = np.array(h5_X[sorted(selector)])
            n = X.shape[0]
            yield {'X': X, 'y': label, 'n': n}

    signature = {'X': {'type': 'ndarray'},
                 'y': {'type': 'scalar'},
                 'n': {'type': 'complex'}}

    batch_size = self.per_batch * self.per_fold * self.per_label
    batch_generator = batchify(generator(), signature, batch_size=batch_size)
    batches_per_epoch = n_classes // (self.per_batch * self.per_fold) + 1

    return {'batch_generator': batch_generator,
            'batches_per_epoch': batches_per_epoch,
            'n_classes': n_classes,
            'classes': unique}
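# A minimal, self-contained sketch of the pandas pattern used above:
# grouping row indices by 'z' so that sequences from the same group can
# be sampled together (toy data, for illustration only):

import pandas as pd

df = pd.DataFrame({'y': [0, 0, 1, 1, 1], 'z': ['a', 'a', 'b', 'b', 'c']})
z_groups = df.groupby('z')

# one representative label per group (same trick as `y_groups` above)
print([group.y.iloc[0] for _, group in z_groups])   # [0, 1, 1]

# row indices belonging to group 'b' (what the generator samples from)
print(list(z_groups.get_group('b').index))          # [2, 3]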
def _get_batch_generator_y(self, data_h5):
    """Get batch generator

    Parameters
    ----------
    data_h5 : str
        Path to HDF5 file containing precomputed sequences.
        It must have two aligned datasets 'X' and 'y'.

    Returns
    -------
    batch : dict
        Dictionary with keys 'batch_generator' (iterable),
        'batches_per_epoch' (int), 'n_classes' (int) and
        'classes' (ndarray of unique labels).
    """
    fp = h5py.File(data_h5, mode='r')
    h5_X = fp['X']
    h5_y = fp['y']

    # keep track of number of labels and rename labels to integers
    unique, y = np.unique(h5_y, return_inverse=True)
    n_classes = len(unique)

    # iterates over sequences of class jC
    # in random order, and forever
    def class_generator(jC):
        indices = np.where(y == jC)[0]
        while True:
            np.random.shuffle(indices)
            for i in indices:
                yield i

    def generator():
        centers = np.arange(n_classes)
        class_generators = [class_generator(jC) for jC in centers]
        previous_label = None
        while True:
            # loop over centers in random order
            np.random.shuffle(centers)
            for iC in centers:
                try:
                    # get "per_fold" closest centers to current center
                    distances = cdist(self.fC_[iC, np.newaxis], self.fC_,
                                      metric=self.metric)[0]
                except AttributeError:
                    # when on_train_begin hasn't been called yet,
                    # attribute fC_ doesn't exist --> fake it
                    distances = np.random.rand(len(centers))
                    distances[iC] = 0.

                closest_centers = np.argpartition(
                    distances, self.per_fold)[:self.per_fold]

                # corner case where last center of previous loop
                # is the same as first center of current loop
                if closest_centers[0] == previous_label:
                    closest_centers[:-1] = closest_centers[1:]
                    closest_centers[-1] = previous_label

                for jC in closest_centers:
                    for _ in range(self.per_label):
                        i = next(class_generators[jC])
                        yield {'X': h5_X[i], 'y': y[i]}
                    previous_label = jC

    signature = {'X': {'type': 'ndarray'}, 'y': {'type': 'ndarray'}}
    batch_size = self.per_batch * self.per_fold * self.per_label
    batch_generator = batchify(generator(), signature, batch_size=batch_size)

    # each fold contains one center and its `per_fold` closest centers
    # therefore, the only way to be sure that we've seen every class in
    # one epoch is to go through `n_classes` folds,
    # i.e. n_classes / per_batch batches
    batches_per_epoch = n_classes // self.per_batch

    return {'batch_generator': batch_generator,
            'batches_per_epoch': batches_per_epoch,
            'n_classes': n_classes,
            'classes': unique}
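# A minimal, self-contained sketch of the "closest centers" selection
# above: `np.argpartition(distances, k)[:k]` returns the indices of the
# k smallest distances (in arbitrary order) without a full sort. Names
# below are illustrative, not from the original code:

import numpy as np
from scipy.spatial.distance import cdist

per_fold = 3
fC = np.random.randn(10, 5)          # 10 class centers in a 5-d space
iC = 0
distances = cdist(fC[iC, np.newaxis], fC, metric='euclidean')[0]
closest_centers = np.argpartition(distances, per_fold)[:per_fold]
assert iC in closest_centers         # a center is closest to itself (d=0)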