import os

import numpy as np

import helper_functions
# load_obj / save_obj are used unqualified below; assumed to live in
# helper_functions alongside the qualified calls elsewhere in this file
from helper_functions import load_obj, save_obj


def __init__(self, batch_size, processed_data_dirname, sigs_0d, sigs_1d,
             sigs_predict, train_or_val='train', shuffle=False,
             data_package=None):
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.sigs_0d = sigs_0d
    self.sigs_1d = sigs_1d
    self.sigs_predict = sigs_predict
    # load preprocessed arrays from disk unless they were passed in directly
    if data_package is None:
        self.data = load_obj(
            os.path.join(processed_data_dirname,
                         '{}_data'.format(train_or_val)))
        self.target = load_obj(
            os.path.join(processed_data_dirname,
                         '{}_target'.format(train_or_val)))
    else:
        self.data = data_package['{}_data'.format(train_or_val)]
        self.target = data_package['{}_target'.format(train_or_val)]
    if train_or_val == 'val':
        for sig in sigs_predict:
            # MAE of always predicting "no change", averaged over rho points
            baseline = np.mean(np.abs(self.target[sig]))
            print('baseline mae averaged for {} over all rho points: {}'
                  .format(sig, baseline))
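# --- Hedged example --------------------------------------------------------
# The "baseline mae" printed above is the error of a trivial model that
# always predicts "no change": the targets are deltas over `delay` timesteps
# (see preprocess_data below), so predicting zero gives MAE = mean(|target|).
# A minimal, self-contained sketch; the shapes here are hypothetical, not
# taken from the repo's data.
def _demo_baseline_mae(n_samples=1000, n_rho=33):
    import numpy as np
    rng = np.random.default_rng(0)
    target = rng.normal(size=(n_samples, n_rho))  # hypothetical delta targets
    baseline_mae = np.mean(np.abs(target))        # zero-change prediction
    print('baseline mae over all rho points:', baseline_mae)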
def __init__(self, batch_size, input_dir_name, train_or_val='train',
             shuffle=False):
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.data = np.array(
        helper_functions.load_obj(input_dir_name + train_or_val + '_data'))
    self.target = np.array(
        helper_functions.load_obj(input_dir_name + train_or_val + '_target'))
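# --- Hedged example --------------------------------------------------------
# `load_obj` / `save_obj` themselves are not defined in this file. A minimal
# sketch of what such helpers typically look like (an assumption, not the
# actual helper_functions module): pickle wrappers keyed by a '.pkl' path.
def _demo_save_obj(obj, name):
    import pickle
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def _demo_load_obj(name):
    import pickle
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)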
def __init__(self, batch_size, processed_data_dirname, train_or_val='train',
             shuffle=False, data_package=None):
    self.batch_size = batch_size
    self.shuffle = shuffle
    # load preprocessed arrays from disk unless they were passed in directly
    if data_package is None:
        self.data = load_obj(
            os.path.join(processed_data_dirname,
                         '{}_data'.format(train_or_val)))
        self.target = load_obj(
            os.path.join(processed_data_dirname,
                         '{}_target'.format(train_or_val)))
    else:
        self.data = data_package['{}_data'.format(train_or_val)]
        self.target = data_package['{}_target'.format(train_or_val)]
def __init__(self, batch_size, processed_data_dirname, train_or_val='train',
             shuffle=False, data_package=None):
    self.batch_size = batch_size
    self.shuffle = shuffle
    # train_or_val and data_package are accepted for interface compatibility
    # but unused here: this variant always loads the full dataset
    self.data = load_obj(os.path.join(processed_data_dirname, 'final_data'))
def __init__(self, batch_size, num_sigs, input_data_file,
             train_or_val='train', shuffle=False, lookback=30, delay=1):
    self.batch_size = batch_size
    self.num_sigs = num_sigs
    self.shuffle = shuffle
    self.lookback = lookback
    self.delay = delay
    separated_data = helper_functions.load_obj(input_data_file)
    separated_data = [
        separated_data[key] for key in sorted(separated_data.keys())
    ]
    k = 5  # 1/k of the data is used for validation
    fold = k - 1  # which fold of the data to use for validation
    num_val_samples = len(separated_data) // k
    if train_or_val == 'train':
        separated_data = (separated_data[:fold * num_val_samples]
                          + separated_data[(fold + 1) * num_val_samples:])
    elif train_or_val == 'val':
        separated_data = separated_data[fold * num_val_samples:
                                        (fold + 1) * num_val_samples]
    else:
        raise ValueError(
            "Specify either 'train' or 'val' for variable 'train_or_val'")
    # record where each shot begins/ends in the flattened time axis
    border_indices = np.cumsum([len(elem) for elem in separated_data])
    border_indices = np.insert(border_indices, 0, 0, axis=0)
    data = []
    for elem in separated_data:
        data.extend(elem)
    data = np.asarray(data)
    # i iterates over shots; each shot contributes only the timestep indices
    # that leave room for a full lookback window and a delay-step target
    separated_possible_indices = [
        np.arange(border_indices[i] + self.lookback,
                  border_indices[i + 1] - self.delay)
        for i in range(len(separated_data))
    ]
    possible_indices = []
    for elem in separated_possible_indices:
        possible_indices.extend(elem)
    self.data = data
    self.possible_indices = possible_indices
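# --- Hedged example --------------------------------------------------------
# How the index bookkeeping above works: `border_indices` marks where shots
# begin and end in the flattened array, and each shot contributes only the
# timesteps with a full `lookback` history behind them and a `delay`-step
# target ahead of them. A toy run with two hypothetical shots of lengths
# 5 and 4:
def _demo_possible_indices(lookback=2, delay=1):
    import numpy as np
    shot_lengths = [5, 4]  # hypothetical shots
    border_indices = np.insert(np.cumsum(shot_lengths), 0, 0)
    possible = [
        np.arange(border_indices[i] + lookback,
                  border_indices[i + 1] - delay)
        for i in range(len(shot_lengths))
    ]
    # border_indices -> [0 5 9]; valid sample indices -> [2, 3] and [7]
    print(border_indices, [p.tolist() for p in possible])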
def preprocess_data(input_filename,
                    output_dirname,
                    sigs_0d,
                    sigs_1d,
                    sigs_predict,
                    n_components=8,
                    avg_window=10,
                    lookback=10,
                    delay=1,
                    train_frac=.05,
                    val_frac=.05,
                    save_data=False,
                    noised_signal=None,
                    sigma=0.5,
                    noised_signal_complete=None,
                    sigma_complete=1):
    # n_components == 0 means we don't want to include 1d signals in the
    # input at all
    if n_components == 0:
        sigs_1d = []

    # Gaussian normalization; returns 0 wherever the std is 0
    def normalize(obj, mean, std):
        a = obj - mean
        b = std
        return np.divide(a, b, out=np.zeros_like(a), where=b != 0)

    # replace NaNs and infs with 0 so they don't poison the statistics
    def finalize_signal(arr):
        arr[np.isnan(arr)] = 0
        arr[np.isinf(arr)] = 0
        return arr

    # load in the raw data
    data = load_obj(input_filename)  # os.path.join(dirname, 'final_data')

    # extract all shots that are in the raw data so we can iterate over them
    shots = sorted(data.keys())
    sigs = list(np.unique(sigs_0d + sigs_1d + sigs_predict))

    # keep only the shots that contain non-empty data for every needed
    # signal (covers both train and validation)
    all_shots = []
    for shot in shots:
        if set(sigs).issubset(data[shot].keys()):
            if all([data[shot][sig].size != 0 for sig in sigs]):
                all_shots.append(shot)

    # flatten each signal across shots into one long time axis
    data_all_times = {}
    for sig in sigs + ['time']:
        data_all_times[sig] = np.array(
            [data[shot][sig] for shot in all_shots])
        data_all_times[sig] = np.concatenate(data_all_times[sig], axis=0)
        data_all_times[sig] = finalize_signal(data_all_times[sig])
    # parallel array recording which shot each timestep belongs to
    data_all_times['shot'] = np.array(
        [[shot] * data[shot][sigs[0]].shape[0] for shot in all_shots])
    data_all_times['shot'] = np.concatenate(data_all_times['shot'], axis=0)

    indices = {}
    subsets = ['train', 'val']
    train_shots = all_shots[:int(len(all_shots) * train_frac)]
    val_shots = all_shots[int(len(all_shots) * train_frac):
                          int(len(all_shots) * (train_frac + val_frac))]
    subset_shots = {'train': train_shots, 'val': val_shots}

    # within each shot, usable sample indices must leave room for a full
    # lookback window before them and a delay-step target after them
    def get_first_ind(arr, val):
        return np.searchsorted(arr, val) + lookback

    def get_last_ind(arr, val):
        return np.searchsorted(arr, val, side='right') - delay

    for subset in subsets:
        indices[subset] = [
            np.arange(get_first_ind(data_all_times['shot'], shot),
                      get_last_ind(data_all_times['shot'], shot) + 1)
            for shot in subset_shots[subset]
        ]
        indices[subset] = np.concatenate(indices[subset])

    # normalization statistics are computed on the training subset only
    means = {}
    stds = {}
    for sig in sigs:
        means[sig] = np.mean(data_all_times[sig][indices['train']], axis=0)
        stds[sig] = np.std(data_all_times[sig][indices['train']], axis=0)
    data_all_times_normed = {}
    for sig in sigs:
        data_all_times_normed[sig] = normalize(data_all_times[sig],
                                               means[sig], stds[sig])

    target = {}
    input_data = {}
    for subset in subsets:
        # targets are the normalized *changes* over the delay window
        final_target = {}
        for sig in sigs_predict:
            final_target[sig] = (
                data_all_times_normed[sig][indices[subset] + delay]
                - data_all_times_normed[sig][indices[subset]])
        target[subset] = np.concatenate(
            [final_target[sig] for sig in sigs_predict], axis=1)

        # inputs are sliding windows covering offsets -lookback ... +delay
        final_input = {}
        for sig in sigs_0d + sigs_1d:
            final_input[sig] = np.stack([
                data_all_times_normed[sig][indices[subset] + offset]
                for offset in range(-lookback, delay + 1)
            ], axis=1)
        final_input_0d = np.concatenate(
            [final_input[sig][:, :, np.newaxis] for sig in sigs_0d], axis=2)
        final_input_1d = np.concatenate(
            [final_input[sig] for sig in sigs_1d], axis=2)
        # the 1d profiles are unknown over the future (delay) timesteps; the
        # original line here was truncated ("... = pad_1d_to"), so persistence
        # padding with the last measured profile is an assumption
        final_input_1d[:, -delay:, :] = final_input_1d[:, -delay - 1:-delay, :]
        input_data[subset] = np.concatenate([final_input_0d, final_input_1d],
                                            axis=2)

    if save_data:
        for subset in subsets:
            save_obj(data_all_times['time'][indices[subset]],
                     os.path.join(output_dirname, '{}_time'.format(subset)))
            save_obj(data_all_times['shot'][indices[subset]],
                     os.path.join(output_dirname, '{}_shot'.format(subset)))
            save_obj(target[subset],
                     os.path.join(output_dirname, '{}_target'.format(subset)))
            save_obj(input_data[subset],
                     os.path.join(output_dirname, '{}_data'.format(subset)))
        save_obj(means, os.path.join(output_dirname, 'means'))
        save_obj(stds, os.path.join(output_dirname, 'stds'))
    else:
        # returning the arrays in memory is not yet implemented
        pass
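# --- Hedged example --------------------------------------------------------
# A sketch of how preprocess_data might be invoked; the paths and signal
# names below are hypothetical placeholders, not the repo's actual settings.
def _demo_preprocess_call():
    preprocess_data(
        input_filename='raw/final_data',   # hypothetical path
        output_dirname='processed',        # hypothetical path
        sigs_0d=['sig0d_a', 'sig0d_b'],    # hypothetical 0d signals
        sigs_1d=['sig1d_a'],               # hypothetical 1d profiles
        sigs_predict=['sig1d_a'],          # predict deltas of this profile
        lookback=10,
        delay=1,
        train_frac=.05,
        val_frac=.05,
        save_data=True)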