def normalize(self, data):
    '''Normalize data of shape (N, C, H, W) based on normalize_mode.'''
    assert len(data.shape) == 4
    if self.normalize_mode == '12':
        # Standardize each (sample, channel) slice over its spatial dims.
        mean = data.mean(axis=(2, 3), dtype=np.float32, keepdims=True)
        std = data.std(axis=(2, 3), dtype=np.float32, keepdims=True)
        data = np.nan_to_num((data - mean) / std)
    elif self.normalize_mode == '3':
        # Standardize over the slice axis of a 192x224x192 volume.
        shape = data.shape
        temp_data = data.reshape((-1, (192 * 224 * 192) // data.shape[2] // data.shape[3],
                                  2, data.shape[2], data.shape[3]))
        mean = temp_data.mean(axis=1, dtype=np.float32, keepdims=True)
        std = temp_data.std(axis=1, dtype=np.float32, keepdims=True)
        data = np.nan_to_num((temp_data - mean) / std).reshape(shape)
    elif self.normalize_mode == '123':
        # Mode '3' followed by mode '12'.
        shape = data.shape
        temp_data = data.reshape((-1, (192 * 224 * 192) // data.shape[2] // data.shape[3],
                                  2, data.shape[2], data.shape[3]))
        mean = temp_data.mean(axis=1, dtype=np.float32, keepdims=True)
        std = temp_data.std(axis=1, dtype=np.float32, keepdims=True)
        data = np.nan_to_num((temp_data - mean) / std).reshape(shape)
        mean = data.mean(axis=(2, 3), dtype=np.float32, keepdims=True)
        std = data.std(axis=(2, 3), dtype=np.float32, keepdims=True)
        data = np.nan_to_num((data - mean) / std)
    return data
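# Hedged usage sketch for normalize above (an assumption, not part of the
# original class): `self` is stubbed with a SimpleNamespace carrying only
# normalize_mode, and the batch shape is illustrative.
import types
import numpy as np

_cfg = types.SimpleNamespace(normalize_mode='12')
_batch = np.random.rand(2, 2, 224, 192).astype(np.float32)  # (N, C, H, W)
_normed = normalize(_cfg, _batch)
# Each (sample, channel) slice is now ~zero mean, unit std over H x W.
assert abs(float(_normed[0, 0].mean())) < 1e-3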
def remove_season(data, standardize=True, mean=None, std=None):
    """Remove seasonality from data along the 'year' dimension.

    Returns the de-seasonalized data (same shape as the input) together with
    the mean and std that were used, so they can be reused on held-out data.
    """
    if mean is None:
        mean = data.mean(dim='year')
        std = data.std(dim='year')
    if standardize:
        data = (data - mean) / std
    else:
        data = data - mean
    return data, mean, std
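# Hedged usage sketch for remove_season: assumes xarray DataArrays with a
# 'year' dimension; the variable names and shapes below are illustrative.
import numpy as np
import xarray as xr

train = xr.DataArray(np.random.rand(30, 4), dims=('year', 'station'))
test = xr.DataArray(np.random.rand(10, 4), dims=('year', 'station'))

train_ds, mu, sigma = remove_season(train)                # fit on training years
test_ds, _, _ = remove_season(test, mean=mu, std=sigma)   # reuse train stats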
def get_data_ch11(batch_size=10, n=1500):
    data = np.genfromtxt(d2l.download('airfoil'), dtype=np.float32, delimiter='\t')
    data = torch.from_numpy((data - data.mean(axis=0)) / data.std(axis=0))
    data_iter = d2l.load_array((data[:n, :-1], data[:n, -1]),
                               batch_size, is_train=True)
    return data_iter, data.shape[1] - 1
def dataSetStatistics(data_dir, batch_size, num_data):
    transform = transforms.Compose([transforms.ToTensor()])
    # Collect image paths relative to data_dir, descending up to two
    # directory levels (e.g. data_dir/<class>/<group>/<file>).
    img_list = []
    for item in listdir(data_dir):
        if isfile(join(data_dir, item)):
            img_list.append(item)
        elif isdir(join(data_dir, item)):
            update_data_dir = join(data_dir, item)
            for f in listdir(update_data_dir):
                if isfile(join(update_data_dir, f)):
                    img_list.append(item + '/' + f)
                elif isdir(join(update_data_dir, f)):
                    deeper_data_dir = join(update_data_dir, f)
                    for y in listdir(deeper_data_dir):
                        if isfile(join(deeper_data_dir, y)):
                            img_list.append(item + '/' + f + '/' + y)
    dataset = UnsuperviseDataset(data_dir, img_list, transform=transform)
    total = len(dataset)
    print('length of entire dataset:', total)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             shuffle=True, num_workers=16)
    # Estimate per-channel mean and std over at most num_data samples.
    mean = 0.
    std = 0.
    m = 0
    for data, _ in dataloader:
        batch_samples = data.size(0)
        data = data.view(batch_samples, data.size(1), -1)  # reshape to (N, C, H*W)
        mean = mean + data.mean(2).sum(0)
        std = std + data.std(2).sum(0)
        m = m + batch_samples
        if m > num_data:
            break
    mean = mean / m
    std = std / m
    return mean, std
def dataSetStatistics(data_dir, batch_size, num_data):
    # Detect if we have a GPU available
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print('Current device: ' + str(device))
    transform = transforms.Compose([transforms.ToTensor()])
    img_list = [f for f in listdir(data_dir) if isfile(join(data_dir, f))]
    dataset = UnsuperviseDataset(data_dir, img_list, transform=transform)
    total = len(dataset)
    print('length of entire dataset:', total)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             shuffle=True, num_workers=16)
    # calculate mean and std for training data
    mean = 0.
    std = 0.
    m = 0
    for data, _ in dataloader:
        batch_samples = data.size(0)
        data = data.view(batch_samples, data.size(1), -1)  # reshape to (N, C, H*W)
        mean = mean + data.mean(2).sum(0)
        std = std + data.std(2).sum(0)
        m = m + batch_samples
        if m > num_data:
            break
    mean = mean / m
    std = std / m
    print('mean:', mean)
    print('std:', std)
    return mean, std
def prepare(self, *select):
    """Load the selected data/label files and return them as tensors.

    Args:
        *select: Keys identifying which data and label files to load.

    Returns:
        A (data, target) pair of torch Tensors, standardized column-wise
        when ``self.stardardize`` is set.
    """
    datafile, labelfile = self.files(*select)
    data_filepath = os.path.join(self.root, datafile)
    label_filepath = os.path.join(self.root, labelfile)
    data = []
    target = []
    with open(data_filepath) as data_f, open(label_filepath) as label_f:
        for x, y in zip(data_f, it.islice(label_f, self.sync_files, None)):
            data.append(list(map(int, x.split())))
            target.append(int(y))
    data = torch.Tensor(data)
    target = torch.Tensor(target)
    if self.stardardize:
        data_mean = data.mean(dim=0, keepdim=True)
        data_std = data.std(dim=0, keepdim=True)
        data = (data - data_mean) / data_std
    return data, target
def calculate_mean_std_dataset(loader):
    mean_d = 0.
    std_d = 0.
    mean_l = 0.
    std_l = 0.
    nb_samples = 0.
    for data, label in loader:
        batch_samples = data.size(0)
        data = data.view(batch_samples, data.size(1), -1)
        mean_d += data.mean(2).sum(0)
        std_d += data.std(2).sum(0)
        nb_samples += batch_samples
        label = label.view(batch_samples, label.size(1), -1)
        mean_l += label.mean(2).sum(0)
        std_l += label.std(2).sum(0)
    mean_d /= nb_samples
    std_d /= nb_samples
    mean_l /= nb_samples
    std_l /= nb_samples
    print("Data Mean: ", mean_d)
    print("Data Std: ", std_d)
    print("Label Mean: ", mean_l)
    print("Label Std: ", std_l)
    return mean_d, std_d, mean_l, std_l
def plot_dist_with_stats(data, labels=None, title='Distribution of ECG Signal',
                         ax=None, stats=True):
    mean = data.mean(skipna=True)
    std = data.std(skipna=True)
    if ax is None:
        fig, ax = plt.subplots()
    sns.distplot(data, bins=200, fit=norm, kde=True, ax=ax,
                 norm_hist=True, hist=True)
    if stats:
        ax.axvline(mean.item(), color='w', linestyle='dashed', linewidth=2)
        ax.axvline(std.item(), color='r', linestyle='dashed', linewidth=2)
        ax.axvline(-std.item(), color='r', linestyle='dashed', linewidth=2)
    ax.set_xlabel("Samples")
    ax.set_ylabel("Probability density")
    ax.set_title(title)
    ax.text(-7, 0.1, "Extreme negatives")
    ax.text(7, 0.1, "Extreme positives")
    if labels is not None:
        plt.legend(labels=labels)
    plt.show()
    return ax
def load_sample(fname, normalize=True):
    from scipy.io.wavfile import read
    mat = read(fname)[1]
    mat = np.float32(mat)
    data = mat.squeeze()[None]
    if normalize:
        data = (data - data.mean()) / data.std()
    return data
def __getitem__(self, index):
    fpath = os.path.join(self.wav_dir, self.df.fname[index])
    y, sr = librosa.load(fpath, sr=self.sr)
    if sr is None:
        print('WARNING:', fpath)
        sr = 44100
    # Random crop
    y = random_crop(y, int(self.max_length * sr))
    # Feature extraction
    n_fft = int(self.window_size * sr)
    hop_length = int(self.hop_size * sr)
    if self.feature == 'mfcc':
        feature = librosa.feature.mfcc(y=y, sr=sr, n_fft=n_fft,
                                       hop_length=hop_length,
                                       n_mfcc=self.n_feature)
    elif self.feature == 'melgram':
        feature = librosa.feature.melspectrogram(y, sr=sr, n_fft=n_fft,
                                                 hop_length=hop_length,
                                                 n_mels=self.n_feature)
    else:
        print('Invalid feature name: %s' % self.feature)
        exit(1)
    data = torch.from_numpy(feature).float()
    s = data.size()
    if self.model_type == 'alex2d' or self.model_type == 'resnet':
        # Conv2d expects (channel, features, frames)
        data.resize_(1, s[0], s[1])
    elif self.model_type == 'alex1d' or self.model_type == 'lstm':
        # Conv1d expects (features, frames)
        data.resize_(s[0], s[1])
    else:
        print('Invalid conv type: %s' % self.model_type)
        exit(1)
    # Standardize in place, guarding against a zero std.
    mean = data.mean()
    std = data.std()
    if std != 0:
        data.add_(-mean)
        data.div_(std)
    if self.test:
        # In test mode there is no ground-truth label, so return only the data.
        return data
    else:
        label = self.df.label_idx[index]
        return data, label
def normalize_dataset(data, normalizer, column_wise=False):
    if normalizer == 'max01':
        if column_wise:
            minimum = data.min(axis=0, keepdims=True)
            maximum = data.max(axis=0, keepdims=True)
        else:
            minimum = data.min()
            maximum = data.max()
        scaler = MinMax01Scaler(minimum, maximum)
        data = scaler.transform(data)
        print('Normalize the dataset by MinMax01 Normalization')
    elif normalizer == 'max11':
        if column_wise:
            minimum = data.min(axis=0, keepdims=True)
            maximum = data.max(axis=0, keepdims=True)
        else:
            minimum = data.min()
            maximum = data.max()
        scaler = MinMax11Scaler(minimum, maximum)
        data = scaler.transform(data)
        print('Normalize the dataset by MinMax11 Normalization')
    elif normalizer == 'std':
        if column_wise:
            mean = data.mean(axis=0, keepdims=True)
            std = data.std(axis=0, keepdims=True)
        else:
            mean = data.mean()
            std = data.std()
        scaler = StandardScaler(mean, std)
        data = scaler.transform(data)
        print('Normalize the dataset by Standard Normalization')
    elif normalizer == 'None':
        scaler = NScaler()
        data = scaler.transform(data)
        print('Does not normalize the dataset')
    elif normalizer == 'cmax':
        # Column min-max, to be deprecated.
        # Note: axis must be the spatial dimension, please check!
        scaler = ColumnMinMaxScaler(data.min(axis=0), data.max(axis=0))
        data = scaler.transform(data)
        print('Normalize the dataset by Column Min-Max Normalization')
    else:
        raise ValueError('Unknown normalizer: %s' % normalizer)
    return data, scaler
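# Hedged sketch (assumption): StandardScaler above is a project-local class,
# not sklearn's. A minimal version consistent with how it is called here:
class StandardScaler:
    """Z-score scaler with an inverse transform for de-normalizing predictions."""

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def transform(self, data):
        return (data - self.mean) / self.std

    def inverse_transform(self, data):
        return data * self.std + self.mean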
def calculate_mean_std_dataset(loader):
    mean = 0.
    std = 0.
    nb_samples = 0.
    for data in loader:
        batch_samples = data.size(0)
        data = data.view(batch_samples, data.size(1), -1)
        mean += data.mean(2).sum(0)
        std += data.std(2).sum(0)
        nb_samples += batch_samples
    mean /= nb_samples
    std /= nb_samples
    return mean, std
def get_data_statistics(data_loader):
    mean = 0.
    std = 0.
    nb_samples = 0.
    for (data, labels) in data_loader:
        batch_samples = data.size(0)
        data = data.view(batch_samples, data.size(1), -1)
        mean += data.mean(2).sum(0)
        std += data.std(2).sum(0)
        nb_samples += batch_samples
    mean /= nb_samples
    std /= nb_samples
    return mean, std
def get_mean_std(loader):
    mean = 0.
    std = 0.
    nb_samples = 0.
    for data, _, _ in loader:
        batch_samples = data.size(0)
        data = data.view(batch_samples, data.size(1), -1)
        mean += data.mean(2).sum(0)
        std += data.std(2).sum(0)
        nb_samples += batch_samples
    mean /= nb_samples
    std /= nb_samples
    return mean, std
def mean_and_std(self) -> Tuple[float, float]:
    loader = DataLoader(self.subsets['train'], batch_size=10,
                        num_workers=1, shuffle=False)
    mean = torch.full((3,), 0.0)
    std = torch.full((3,), 0.0)
    nb_samples = 0.
    for data, gt in loader:
        batch_samples = data.size(0)
        data = data.view(batch_samples, data.size(1), -1)
        mean += data.mean(2).sum(0)
        std += data.std(2).sum(0)
        nb_samples += batch_samples
    mean /= nb_samples
    std /= nb_samples
    return mean, std
def loss_values_stat(self, loss_vales):
    """Summary statistics for one batch of loss values.

    :param loss_vales: loss values produced by the samples in one batch
    :return: a summary string
    """
    if not loss_vales:
        raise ValueError
    data = np.array(loss_vales, dtype=float)
    n, sum_ = len(data), data.sum()
    mean, std = data.mean(), data.std()
    msg = f'total_loss={sum_:.3f}, mean±std={mean:.3f}±{std:.3f}({max(data):.3f}->{min(data):.3f})'
    if sum_ < self.min_total_loss:
        self.min_total_loss = sum_
        msg = '*' + msg
    return msg
def computeStatistics(loader):
    mean = 0.
    std = 0.
    nb_samples = 0.
    for data in loader:
        batch_samples = data.size(0)
        data = data.view(batch_samples, data.size(1), -1)
        mean += data.mean(2).sum(0)
        std += data.std(2).sum(0)
        nb_samples += batch_samples
    mean /= nb_samples
    std /= nb_samples
    return mean, std
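# Note: the loops above average per-sample stds, which only approximates the
# std over the whole dataset. A hedged sketch of the exact computation via
# running sums of x and x^2 (function name here is illustrative):
import torch

def exact_channel_mean_std(loader):
    """Exact per-channel mean/std across every pixel of every sample."""
    total = 0
    s = 0.0   # running sum of values per channel
    sq = 0.0  # running sum of squared values per channel
    for data in loader:
        data = data.view(data.size(0), data.size(1), -1)  # (N, C, H*W)
        total += data.size(0) * data.size(2)              # pixels per channel
        s = s + data.sum(dim=(0, 2))
        sq = sq + (data ** 2).sum(dim=(0, 2))
    mean = s / total
    std = (sq / total - mean ** 2).sqrt()                 # E[x^2] - E[x]^2
    return mean, std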
def process_mnist(self, mnist: torch.utils.data.Dataset, labels_keep: tuple):
    data = []
    targets = []
    for image, label_old in tqdm(mnist, desc=f"Preparing {self.__class__.__name__} dataset"):
        if label_old in labels_keep:
            label_new = labels_keep.index(label_old)
            targets.append(label_new)
            data.append(image)
    data = torch.cat(data, dim=0)
    data_mean = data.mean(dim=0)
    data_std = data.std(dim=0)
    data = (data - data_mean) / data_std
    targets = torch.LongTensor(targets)
    data_path = self.get_data_path()
    data_path.parent.mkdir(exist_ok=True, parents=True)
    with open(data_path, 'wb') as f:
        torch.save((data, targets), f)
    print(f"Saved preprocessed data to {data_path}")
def dataSetStatistics(data_dir, batch_size):
    """Calculate the statistics of the dataset."""
    image_size = (256, 256)
    transform = transforms.Compose([transforms.Resize(image_size),
                                    transforms.ToTensor()])
    dataset = torchvision.datasets.ImageFolder(data_dir, transform=transform,
                                               target_transform=None)
    m = len(dataset)  # number of samples
    print('length of entire dataset:', m)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             shuffle=False, num_workers=16)
    # calculate mean and std for training data
    mean = 0.
    std = 0.
    for data, data_label in dataloader:
        batch_samples = data.size(0)
        data = data.view(batch_samples, data.size(1), -1)  # reshape to (N, C, H*W)
        mean = mean + data.mean(2).sum(0)
        std = std + data.std(2).sum(0)
    mean = mean / m
    std = std / m
    print('mean:', mean)
    print('std:', std)
    return mean, std
def get_dataloader(dataset, batch_size=128, window=12, horizon=1,
                   val_days=10, test_days=10, normalizer='max'):
    if dataset == 'SYDNEY':
        data = Load_Sydney_Demand_Data(os.path.join(base_dir, '1h_data_new3.csv'))
        print(data.shape)
        print('Load Sydney Dataset Successfully!')
    if normalizer == 'max':
        scaler = MinMaxScaler(data.min(), data.max())
        data = scaler.transform(data)
        print('Normalize the dataset by MinMax Normalization')
    elif normalizer == 'std':
        scaler = StandardScaler(data.mean(), data.std())
        data = scaler.transform(data)
        print('Normalize the dataset by Standard Normalization')
    else:
        scaler = None
    X, Y = Add_Window_Horizon(data, window, horizon)
    print(X.shape, Y.shape)
    x_tra, x_val, x_test = split_train_val_test(X, val_days, test_days)
    y_tra, y_val, y_test = split_train_val_test(Y, val_days, test_days)
    print(x_tra.shape, y_tra.shape)
    print(x_val.shape, y_val.shape)
    print(x_test.shape, y_test.shape)
    train_dataloader = data_loader(x_tra, y_tra, batch_size, 'train')
    val_dataloader = data_loader(x_val, y_val, batch_size, 'val')
    test_dataloader = data_loader(x_test, y_test, batch_size, 'test')
    dataloader = data_loader(X, Y, batch_size, 'all')
    return train_dataloader, val_dataloader, test_dataloader, scaler
def prepare(self):
    """Make torch Tensors from data and label files.

    Returns:
        A (data, target) pair of torch Tensors; labels are shifted to start
        at 0, and the data is standardized column-wise when
        ``self.stardardize`` is set.
    """
    datafile = self.urls[0].rpartition('/')[2]
    data_filepath = os.path.join(self.root, datafile)
    data = []
    target = []
    with open(data_filepath) as data_f:
        for sample in data_f:
            x, y, label = tuple(map(float, sample.split()))
            data.append([x, y])
            target.append(int(label) - 1)
    data = torch.Tensor(data)
    target = torch.Tensor(target)
    if self.stardardize:
        data_mean = data.mean(dim=0, keepdim=True)
        data_std = data.std(dim=0, keepdim=True)
        data = (data - data_mean) / data_std
    return data, target
def wav_preprocess(self, data):
    '''
    augmentation:
    1. Deep Convolutional Neural Networks and Data Augmentation for
       Environmental Sound Classification
    2. http://www.mirlab.org/conference_papers/International_Conference/ISMIR%202015/website/articles_splitted/264_Paper.pdf
       - dynamic range compression
       - add background sounds
       - added noise and dropout
       - random filters
       - loudness (to spectrogram)
       - pitch shift +-10% ???
       - tempo shift ???
    '''
    # == resample
    # if np.random.choice([True, False, False]):
    #     resample_rate = np.random.choice([0.9, 1.1])
    #     if resample_rate != 1:
    #         data = librosa.resample(data, self.sr, (self.sr * resample_rate))
    # == time stretch +-20%
    # if np.random.choice([True, False, False]):
    #     stretch_rate = np.random.rand() * 0.4 + 0.8
    #     data = librosa.effects.time_stretch(data, stretch_rate)  # positive - faster
    data = spectrum(data, self.sr)

    # amplify spectrogram (choice over [True, True] always applies this step)
    if np.random.choice([True, True]):
        amplify = np.random.rand() * 0.6 + 0.7  # +-30%
        data = data * amplify

    # add gaussian noise (always applied, same pattern as above)
    if np.random.choice([True, True]):
        noise_level = np.random.rand() * data.std() * 1.0
        data = data + np.random.randn(data.shape[0], data.shape[1]) * noise_level

    # zoom the spectrogram
    if np.random.choice([True, False]):
        zoom_len = np.random.rand() * 0.6 + 0.7  # 30%
        zoom_frq = 1  # np.random.rand() * 0.1 + 0.95  # 5%
        zoomed = zoom(data, (zoom_frq, zoom_len))
        if zoom_frq >= 1:
            data = zoomed[0:128, 0:zoomed.shape[1]]
        else:
            data = np.random.randn(128, zoomed.shape[1])
            data[0:zoomed.shape[0], 0:zoomed.shape[1]] = zoomed[:, :]

    pitch_aug = np.random.choice(['none', 'up', 'down'])
    # shift pitch UP (compare with '==', not 'is': string identity is unreliable)
    if pitch_aug == 'up':
        shift = np.random.choice([3, 2, 1])
        data[0:128 - shift, :] = data[shift:128, :]
        data[128 - shift:128, :] = np.random.randn(shift, data.shape[1])
    # shift pitch DOWN
    if pitch_aug == 'down':
        shift = np.random.choice([1, 2, 3])
        data[shift:128, :] = data[0:128 - shift, :]
        data[0:shift, :] = np.random.randn(shift, data.shape[1])

    # add no more than 25% of bg sounds
    if self.add_bg:
        bg = self.backgrounds[np.random.randint(0, self.backgrounds.shape[0]), :, :]
        bg = bg / bg.std() * data.std()
        if data.shape[1] < bg.shape[1]:
            data = data + bg[:, 0:data.shape[1]] * np.random.rand() * 0.25
        else:
            data[:, 0:bg.shape[1]] = data[:, 0:bg.shape[1]] + bg * np.random.rand() * 0.25
    return data
@classmethod
def from_data(cls, data):
    return cls(shift=data.mean(), scale=1.0 / (1e-5 + data.std()))
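# Hedged sketch (assumption): a minimal affine-normalizer class that from_data
# above would plausibly belong to; AffineNormalizer is an illustrative stand-in.
class AffineNormalizer:
    def __init__(self, shift=0.0, scale=1.0):
        self.shift = shift
        self.scale = scale

    def __call__(self, data):
        # Map data to roughly zero mean, unit scale; the 1e-5 in from_data
        # keeps the scale finite when data.std() is zero.
        return (data - self.shift) * self.scale

    @classmethod
    def from_data(cls, data):
        return cls(shift=data.mean(), scale=1.0 / (1e-5 + data.std()))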
def get_data_ch7():
    data = np.genfromtxt('Datasets/airfoil_self_noise.dat', delimiter='\t')
    data = (data - data.mean(axis=0)) / data.std(axis=0)
    return (torch.tensor(data[:1500, :-1], dtype=torch.float32),
            torch.tensor(data[:1500, -1], dtype=torch.float32))
def normalize(data):
    mean = data.mean(0)
    std = data.std(0)
    z = (data - mean) / std
    return z, mean, std
def normalize(self, data):
    # Per-channel statistics over (N, H, W), rescaled from [0, 255] to [0, 1].
    mean = data.mean(axis=(0, 1, 2)) / 255.0
    std = data.std(axis=(0, 1, 2)) / 255.0
    normalize = transforms.Normalize(mean=mean, std=std)
    return normalize
def normalize_(data: torch.Tensor):
    mean = data.mean(0)
    std = data.std(0)
    return (data - mean) / std, mean, std
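# Hedged usage sketch for normalize_: fit on training rows, then reuse the
# returned statistics on held-out rows (tensor shapes are illustrative).
import torch

train = torch.randn(100, 8) * 3 + 5
test = torch.randn(20, 8) * 3 + 5

train_n, mu, sigma = normalize_(train)
test_n = (test - mu) / sigma  # reuse training mean/std, do not refit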