from helper import Utilities, PerformanceEvaluation
import pandas as pd
import numpy as np
from metric_learning import MetricLearning, Subsampling
from user_feedback import Similarity
import pickle
from scipy.misc import comb
import time

"""
In the demo, we will showcase an example of special purpose publication.
The data user wants the published energy database to maximally retain the
information about peak-time energy usage.
"""

# Initialization of some useful classes
util = Utilities()
pe = PerformanceEvaluation()
mel = MetricLearning()

# step 1: get the database to be published
day_profile = pd.read_pickle('../dataset/dataframe_all_energy.pkl')
# subsample the database to improve the speed for demonstration purposes
day_profile = day_profile.iloc[0:90, 0::4]
day_profile.index = range(len(day_profile.index))
rep_mode = 'mean'
anonymity_level = 2  # desired anonymity level

# step 2: the data user specifies his/her interest. In this example, the data
# user is interested in preserving the information of the cumulative energy
# use during peak time. In this case, he/she would also need to specify the
# starting and ending time of the peak usage window.
interest = 'window-usage'
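# A hedged sketch of how this demo typically continues (the peak-time window
# below mirrors the [17, 21] window used in evaluation_total_usage elsewhere
# in this repository; treat the exact values as illustrative assumptions):
window = [17, 21]  # assumed starting and ending hour of peak usage

# pre-sanitize the database with the self-defined (special-purpose) metric
# and with a generic Euclidean metric, for later comparison
sanitized_profile_best = util.sanitize_data(
    day_profile, distance_metric='self-defined',
    anonymity_level=anonymity_level, rep_mode=rep_mode,
    mode=interest, window=window)
sanitized_profile_baseline = util.sanitize_data(
    day_profile, distance_metric='euclidean',
    anonymity_level=anonymity_level, rep_mode=rep_mode)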
from helper import Utilities, PerformanceEvaluation  # needed for the classes instantiated below
import pandas as pd
from subsampling import Subsampling
from user_feedback import Similarity
from scipy.misc import comb
from deep_metric_learning import Deep_Metric
import numpy as np
import pickle
from linear_metric_learning import Linear_Metric
import pdb

# """
# In the demo, we will showcase an example of special purpose publication.
# The data user wants the published database to maximally retain the
# information about lunch time.
# """

# Initialization of some useful classes
util = Utilities()
pe = PerformanceEvaluation()

# step 1: get the database to be published and the database used for
# learning the distance metric
day_profile_all = pd.read_pickle('./dataset/dataframe_all_binary.pkl')
day_profile = day_profile_all.iloc[0:90, 0::60]  # the database to be published
day_profile_metric_learn = day_profile_all.iloc[90:-1, 0::60]  # the database for learning the distance metric
# dropna() returns a new DataFrame, so reassign to actually drop missing rows
day_profile = day_profile.dropna()
day_profile_metric_learn = day_profile_metric_learn.dropna()
rep_mode = 'mean'
anonymity_level = 2  # desired anonymity level

# step 2: the data user specifies his/her interest. In this example, the data
# user is interested in preserving the information of a segment of the entire
# time series. In this case, he/she would also need to specify the starting
# and ending time of the time series segment of interest.
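# A hedged sketch of that specification; the interest label and the window
# bounds below are illustrative assumptions, not values taken from this file.
interest = 'segment'  # assumed label for "preserve a segment of the time series"
window = [11, 13]     # assumed lunch-time segment boundaries (start, end)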
from helper import Utilities, PerformanceEvaluation
import pandas as pd
from user_feedback import Similarity
from scipy.misc import comb
from deep_metric_learning import Deep_Metric
import numpy as np
import pickle
from linear_metric_learning import Linear_Metric
from subsampling import Subsampling

"""
In the demo, we will showcase an example of special purpose publication.
The data user wants the published database to maximally retain the
information about lunch time.
"""

# Initialization of some useful classes
util = Utilities()
pe = PerformanceEvaluation()


def evaluation_occupancy_statistics(n, mode="arrival"):
    day_profile1 = pd.read_pickle('./dataset/dataframe_all_binary.pkl')
    day_profile1 = day_profile1.fillna(0)
    day_profile1[day_profile1 > 0] = 1
    res = 15
    # subsample the database to improve the speed for demonstration purposes
    day_profile = day_profile1.iloc[:90, 0::res]
    # subsample the database to improve the speed for demonstration purposes
    day_profile2 = day_profile1.iloc[90:-1, 0::res]
import pandas as pd
from helper import Utilities, PerformanceEvaluation  # needed for the classes instantiated below
from user_feedback import Similarity
from scipy.misc import comb
from deep_metric_learning import Deep_Metric
import numpy as np
import pickle
from linear_metric_learning import Linear_Metric
from subsampling import Subsampling

"""
In the demo, we will showcase an example of special purpose publication.
The data user wants the published database to maximally retain the
information about lunch time.
"""

# Initialization of some useful classes
util = Utilities()
pe = PerformanceEvaluation()


def evaluation_occupancy_statistics(n, df_subsampled_from, day_profile):
    anonymity_level = n
    mode = "arrival"
    rep_mode = 'mean'
    subsample_size_max = int(comb(len(df_subsampled_from), 2))

    # step 4: sample a subset of the pre-sanitized database and form the data
    # points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(
        subsample_size=subsample_size, seed=None)

    # User receives the data pairs and labels the similarity
def evaluation_total_usage(n):
    """
    In the demo, we will showcase an example of special purpose publication.
    The data user wants the published energy database to maximally retain the
    information about peak-time energy usage.
    """
    # Initialization of some useful classes
    util = Utilities()
    pe = PerformanceEvaluation()

    # step 1: get the database to be published
    day_profile = pd.read_pickle('dataset/dataframe_all_energy.pkl')
    day_profile = day_profile.fillna(0)
    # subsample the database to improve the speed for demonstration purposes
    day_profile = day_profile.iloc[0:90, 0::4]
    day_profile.index = range(len(day_profile.index))
    rep_mode = 'mean'
    anonymity_level = n  # desired anonymity level

    # step 2: the data user specifies his/her interest. In this example, the
    # data user is interested in preserving the information of the cumulative
    # energy use during peak time. In this case, he/she would also need to
    # specify the starting and ending time of the peak usage window.
    interest = 'window-usage'
    window = [17, 21]

    sanitized_profile_best = util.sanitize_data(
        day_profile, distance_metric='self-defined',
        anonymity_level=anonymity_level, rep_mode=rep_mode,
        mode=interest, window=window)

    # step 3: pre-sanitize the database
    sanitized_profile_baseline = util.sanitize_data(
        day_profile, distance_metric='euclidean',
        anonymity_level=anonymity_level, rep_mode=rep_mode)

    loss_best_metric = pe.get_statistics_loss(
        data_gt=day_profile, data_sanitized=sanitized_profile_best,
        mode=interest, window=window)
    loss_generic_metric = pe.get_statistics_loss(
        data_gt=day_profile, data_sanitized=sanitized_profile_baseline,
        mode=interest, window=window)
    # print("information loss with learned metric %s" % loss_generic_metric)

    df_subsampled_from = sanitized_profile_baseline.drop_duplicates().sample(frac=1)
    subsample_size_max = int(comb(len(df_subsampled_from), 2))
    print('total number of pairs is %s' % subsample_size_max)

    # step 4: sample a subset of the pre-sanitized database and form the data
    # points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(
        subsample_size=subsample_size, seed=None)

    # User receives the data pairs and labels the similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest='statistics', stat_type=interest,
                                     window=window)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(
        range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the
    # user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metric
    sanitized_profile_deep = util.sanitize_data(
        day_profile, distance_metric="deep", anonymity_level=anonymity_level,
        rep_mode=rep_mode, deep_model=dm, window=window)
    sanitized_profile = util.sanitize_data(
        day_profile, distance_metric="deep", anonymity_level=anonymity_level,
        rep_mode=rep_mode, deep_model=lm, window=window)

    # (optionally, for evaluation purposes) evaluate the information loss of
    # the sanitized database
    loss_learned_metric_deep = pe.get_statistics_loss(
        data_gt=day_profile, data_sanitized=sanitized_profile_deep.round(),
        mode=interest, window=window)
    loss_learned_metric = pe.get_statistics_loss(
        data_gt=day_profile, data_sanitized=sanitized_profile,
        mode=interest, window=window)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
print("information loss with best metric %s" % loss_best_metric) print("information loss with generic metric %s" % loss_generic_metric) print("information loss with learned metric %s" % loss_learned_metric) print("information loss with learned metric deep %s" % (loss_learned_metric_deep)) return (sanitized_profile_best, sanitized_profile_baseline, sanitized_profile, sanitized_profile_deep), (loss_best_metric, loss_generic_metric, loss_learned_metric, loss_learned_metric_deep), subsample_size
import pandas as pd
from helper import Utilities

util = Utilities()

# step 1: get the original database to be published
day_profile = pd.read_pickle('../dataset/dataframe_all_binary.pkl')

# (optional) subsample the time series in each row of the database
res = 15
day_profile = day_profile.iloc[0::5, 0::res]

# step 2: specify the desired anonymity level
anonymity_level = 5

# util.sanitize_data will privatize the database according to the desired
# anonymity level
sanitized_profile = util.sanitize_data(day_profile,
                                       distance_metric='euclidean',
                                       anonymity_level=anonymity_level,
                                       rep_mode='mean')
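# (optional) a minimal follow-up sketch: persist the sanitized database so it
# can be handed to data users; the output path is an illustrative assumption
sanitized_profile.to_pickle('../dataset/dataframe_all_binary_sanitized.pkl')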
class Preprocessor:
    def __init__(self, set_name):
        self.SET_NAME = set_name
        self.co = Constants()
        self.io = DataFiles()
        self.u = Utilities()

    def data_exists(self):
        filename = '{}.npz'.format(self.SET_NAME)
        return os.path.exists(os.path.join(self.co.DATA_ROOT, filename))

    def path_hierarchy_exists(self):
        filename = 'timit_{}_path_hierarchy.json'.format(self.SET_NAME)
        return os.path.exists(os.path.join(self.co.DATA_ROOT, filename))

    def path_hierarchy_with_features_exists(self):
        filename = 'timit_{}_samples.json'.format(self.SET_NAME)
        return os.path.exists(os.path.join(self.co.DATA_ROOT, filename))

    def create_hierarchies(self):
        hierarchies = []
        for timit_version in ['timit', 'ffmtimit']:
            if self.SET_NAME == 'test':
                self.tim = TIMIT(test=True, timit_version=timit_version)
            else:
                self.tim = TIMIT(timit_version=timit_version)
            print('Creating path hierarchy for {}...'.format(timit_version))
            hierarchy = self.tim.create_paths_hierarchy()
            hierarchies.append(hierarchy)

        print('Merging the two')
        # Merges the TIMIT and FFMTIMIT hierarchies
        full_hierarchy = self.tim.merge_hierarchies(hierarchies[0],
                                                    hierarchies[1])

        print('Exporting path hierarchy...')
        hierarchy_filename = 'timit_{}_path_hierarchy.json'.format(
            self.SET_NAME)
        self.io.export_to_json_lines(full_hierarchy, hierarchy_filename)

        print('Creating samples and mspec features...')
        for accent in self.io.import_from_json_lines(hierarchy_filename):
            print('\tFor accent: ', accent['accent'])
            speakers = accent['speakers']
            for speaker in speakers:
                speaker_sentences = speaker['sentences']
                for sentence in speaker_sentences:
                    audio = sentence['audio']
                    samples, samplingrate = self.u.loadAudio(audio)
                    # power_spectrum = self.u.get_power_spectrum(samples)
                    log_mspec = self.u.get_mspec(samples,
                                                 samplingrate=samplingrate)
                    # mspec = librosa.feature.melspectrogram(S=power_spectrum,
                    #                                        sr=samplingrate,
                    #                                        n_mels=64)
                    # log_mspec = librosa.core.amplitude_to_db(mspec)
                    samples_path = self.io.store_in_archive(
                        samples, sentence, self.SET_NAME, 'samples')
                    mspec_path = self.io.store_in_archive(
                        log_mspec, sentence, self.SET_NAME, 'mspec')
                    sentence["samples_path"] = samples_path
                    sentence["audio_sr"] = samplingrate
                    sentence["mspec_path"] = mspec_path
            print('\t\tAttempting to store entry')
            samples_filename = 'timit_{}_samples.json'.format(self.SET_NAME)
            self.io.export_entry_to_json_line(accent, samples_filename)
            print('\t\tStored')
        print('Hierarchy stored in {}'.format(hierarchy_filename))

    def transform_data(self):
        '''
        Reads an already created directory of paths and loads the specified
        mspec features.
        Returns the utterances and their accent
        '''
        entries = self.io.import_from_json_lines(
            'timit_{}_samples.json'.format(self.SET_NAME))
        samples, targets = [], []
        for entry in entries:
            accent = entry['accent']
            for speaker in entry['speakers']:
                for sentence in speaker['sentences']:
                    # The mspec features are 128*frames
                    mspec_features = np.load(sentence['mspec_path'])
                    # Transpose them to frames*128
                    samples.append(mspec_features.T)
                    # Each accent is a string of the form dri where i = [1, 8]
                    targets.append(accent)
        targets_int = self._targets_to_ints(targets)
        stored_into = self._store_data(samples, targets_int, targets)
        return stored_into

    def standardize_dataset(self):
        '''
        Performs dataset-level standardization
        '''
        data = np.load(
            os.path.join(self.co.DATA_ROOT, '{}.npz'.format(self.SET_NAME)))
        samples = data['X']
        _, sentence_indexes_tuples = self._get_indexes_to_slice()
        data_stack = np.vstack(samples)
        scaler = StandardScaler()
        if self.SET_NAME == 'train':
            scaled = scaler.fit_transform(data_stack)
            self.TRAIN_MEAN = scaler.mean_
            self.TRAIN_VAR = scaler.var_
            filename = os.path.join(self.co.DATA_ROOT,
                                    'training_set_meta.npz')
            np.savez(filename, mean=self.TRAIN_MEAN, var=self.TRAIN_VAR)
        elif self.SET_NAME == 'test':
            meta = np.load(
                os.path.join(self.co.DATA_ROOT, 'training_set_meta.npz'))
            self.TRAIN_MEAN = meta['mean']
            self.TRAIN_VAR = meta['var']
            # Scale by the training standard deviation (sqrt of the stored
            # variance) to mirror what StandardScaler does on the training set
            scaled = np.divide(data_stack - self.TRAIN_MEAN,
                               np.sqrt(self.TRAIN_VAR))
        all_scaled_samples = []
        for slice_tuple in sentence_indexes_tuples:
            slice_start = slice_tuple[0]
            slice_end = slice_tuple[1]
            sentence = scaled[slice_start:slice_end]
            all_scaled_samples.append(sentence.tolist())
        stored_into = self._store_data(all_scaled_samples, data['Y'],
                                       data['Y_string'], 'dataset_scaled')
        return stored_into

    def standardize_speaker(self):
        '''
        Performs speaker-level standardization
        '''
        data = np.load(
            os.path.join(self.co.DATA_ROOT, '{}.npz'.format(self.SET_NAME)))
        samples = data['X']
        speaker_indexes_tuples, sentence_indexes_tuples = \
            self._get_indexes_to_slice()
        all_scaled_samples = []
        scaler = StandardScaler()
        for slice_tuple in speaker_indexes_tuples:
            slice_start = slice_tuple[0]
            slice_end = slice_tuple[1]
            # Slice the samples list and stack it to get a 2d matrix
            speaker_slice = np.vstack(samples[slice_start:slice_end])
            # Scaling
            scaled = scaler.fit_transform(speaker_slice)
            # Add a list of the scaled samples to the list
            all_scaled_samples.append(scaled)
        data_stack = np.vstack(all_scaled_samples)
        all_scaled_samples = []
        for slice_tuple in sentence_indexes_tuples:
            slice_start = slice_tuple[0]
            slice_end = slice_tuple[1]
            sentence = data_stack[slice_start:slice_end]
            all_scaled_samples.append(sentence.tolist())
        stored_into = self._store_data(all_scaled_samples, data['Y'],
                                       data['Y_string'], 'speaker_scaled')
        return stored_into

    def _get_indexes_to_slice(self):
        hierarchy_filename = 'timit_{}_samples.json'.format(self.SET_NAME)
        speaker_indexes_tuples = []
        previous_speakers_sentences = 0
        sentence_indexes_tuples = []
        previous_sentences = 0
        for accent in self.io.import_from_json_lines(hierarchy_filename):
            speakers = accent['speakers']
            for speaker in speakers:
                speaker_sentences_num = len(speaker['sentences'])
                slice_start = previous_speakers_sentences
                slice_end = previous_speakers_sentences + speaker_sentences_num
                speaker_indexes_tuples.append((slice_start, slice_end))
                previous_speakers_sentences += speaker_sentences_num
                for sentence in speaker['sentences']:
                    mspec_features = np.load(sentence['mspec_path'])
                    sentence_frames = np.shape(mspec_features.T)[0]
                    slice_start = previous_sentences
                    slice_end = previous_sentences + sentence_frames
                    sentence_indexes_tuples.append((slice_start, slice_end))
                    previous_sentences += sentence_frames
        return speaker_indexes_tuples, sentence_indexes_tuples

    def _targets_to_ints(self, targets):
        '''
        Turns a list of strings into a list of unique int ids
        '''
        unique = np.unique(targets).tolist()
        out = [unique.index(t) for t in targets]
        return out

    def _store_data(self, samples, targets_int, targets, scaling_type=''):
        if scaling_type == '':
            scaling = ''
        else:
            scaling = scaling_type + '_'
        filename = os.path.join(self.co.DATA_ROOT,
                                '{}{}.npz'.format(scaling, self.SET_NAME))
        np.savez(filename, X=samples, Y=targets_int, Y_string=targets)
        return filename

    def add_noise(self, samples):
        '''
        Adds noise drawn from a standard Gaussian to each sample

        samples: a list of frames*128 samples

        Returns a samples list double the size of the argument, with the
        noisy samples appended to the end of it
        '''
        samples = samples.tolist()
        for i in range(len(samples)):
            noise = np.random.standard_normal(np.shape(samples[i]))
            samples.append(samples[i] + noise)
        return samples
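# A minimal usage sketch of the preprocessing pipeline; the set name 'train'
# and the call order are illustrative assumptions, not prescribed by the class
def example_preprocess_train_set():
    pp = Preprocessor('train')
    if not pp.path_hierarchy_with_features_exists():
        # build the merged TIMIT/FFMTIMIT path hierarchy and mspec features
        pp.create_hierarchies()
    if not pp.data_exists():
        # gather samples and accent targets into train.npz
        pp.transform_data()
    # speaker-level standardization of the stored samples
    return pp.standardize_speaker()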
class DataFiles:
    def __init__(self):
        self.co = Constants()
        self.u = Utilities()

    def export_to_json_lines(self, hierarchy, filename, indent=None):
        if not filename.endswith('.json'):
            raise ValueError('Filename does not end in .json')
        with open(os.path.join(self.co.DATA_ROOT, filename), 'w') as f:
            for line in hierarchy:
                json_s = json.dumps(line, indent=indent)
                f.write(json_s)
                f.write('\n')

    def import_from_json_lines(self, filename):
        if not filename.endswith('.json'):
            raise ValueError('Filename does not end in .json')
        with open(os.path.join(self.co.DATA_ROOT, filename), 'r') as f:
            for line in f:
                yield json.loads(line)

    def export_entry_to_json_line(self, entry, filename, indent=None):
        # if not os.path.isfile(filename):
        #     with open(os.path.join(self.co.DATA_ROOT, filename), 'w') as f:
        #         log_feed = []
        #         json_s = json.dumps(log_feed, indent)
        #         f.write(json_s)
        # with open(os.path.join(self.co.DATA_ROOT, filename), 'r') as f:
        #     log_feed = json.load(f)
        # log_feed.append(entry)
        with open(os.path.join(self.co.DATA_ROOT, filename), 'a') as f:
            json_s = json.dumps(entry, indent=indent)
            f.write(json_s)
            f.write('\n')

    def store_in_archive(self, arr, sentence, set_name, arr_type):
        if type(arr) != np.ndarray:
            raise ValueError(('Can only store numpy.ndarray. '
                              'Was passed: ', type(arr)))
        accent, gender, speaker_id, text_type, sentence_number = \
            self.u.path2info(sentence['audio'])
        filename = os.path.join('{}_{}{}.npy'.format(arr_type, text_type,
                                                     sentence_number))
        root = self.create_or_return(
            (self.co.DATA_ROOT, set_name, arr_type, accent,
             '{}{}'.format(gender, speaker_id)))
        file = os.path.join(root, filename)
        np.save(file, arr)
        return file

    def create_or_return(self, path_parts_tuple):
        path = path_parts_tuple[0]
        for part in path_parts_tuple[1:]:
            path = os.path.join(path, part)
            if not os.path.exists(path):
                os.mkdir(path)
        return path
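# A minimal usage sketch of the JSON-lines helpers; the entry dict and the
# filename are illustrative assumptions
def example_roundtrip_json_lines():
    io = DataFiles()
    io.export_to_json_lines([{'accent': 'dr1', 'speakers': []}],
                            'example_hierarchy.json')
    for entry in io.import_from_json_lines('example_hierarchy.json'):
        print(entry['accent'])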
class TIMIT:
    def __init__(self, test=False, timit_version=None):
        co = Constants()
        self.u = Utilities()
        if timit_version:
            timit_version = timit_version.lower()
            if timit_version not in ['timit', 'ffmtimit']:
                raise ValueError('Bad timit version')
            if timit_version == 'timit':
                if test:
                    self.SET_ROOT = co.TIMIT_TEST_ROOT
                else:
                    self.SET_ROOT = co.TIMIT_TRAIN_ROOT
            elif timit_version == 'ffmtimit':
                if test:
                    self.SET_ROOT = co.FFMTIMIT_TEST_ROOT
                else:
                    self.SET_ROOT = co.FFMTIMIT_TRAIN_ROOT

    def _get_accent_paths(self):
        '''
        Returns a list containing the paths to the accent directories in the
        training set, from the TIMIT_ROOT
        '''
        return [
            os.path.join(self.SET_ROOT, accent_dir)
            for accent_dir in os.listdir(self.SET_ROOT)
        ]

    def _get_speaker_files(self, accent):
        '''
        accent: path to an accent directory in timit

        Returns a list of paths to each speaker in the given accent directory
        '''
        return [
            os.path.join(accent, speaker_dir)
            for speaker_dir in os.listdir(accent)
        ]

    def _get_utterance_files(self, speaker):
        '''
        speaker: path to a speaker directory in timit

        Returns a list of tuples. Each tuple contains four paths,
        corresponding to the .wav, .txt, .wrd and .phn files for each
        sentence the speaker has uttered
        '''
        files = [
            os.path.join(speaker, file_type)
            for file_type in os.listdir(speaker)
        ]
        speaker_files = []
        count = 0
        for end in range(4, len(files), 4):
            start = count
            slice = files[start:end]
            speaker_files.append(tuple(slice))
            count += len(slice)
        return speaker_files

    def create_paths_hierarchy(self):
        '''
        Returns a list of accent dictionaries.

        Each accent dictionary contains the name of the accent, the path to
        its directory and a list of speakers for the accent.

        The list of speakers contains the speaker id, the gender, and a list
        of sentences the speaker has uttered.

        The list of sentences contains the text type, the sentence number,
        and the paths to the phonetic, text and word transcriptions, as well
        as the path to the audio file of the recording.
        '''
        accent_paths = self._get_accent_paths()
        accents_list = []
        for accent_path in accent_paths:
            speaker_directories = self._get_speaker_files(accent_path)
            speakers_list = []
            for speaker in speaker_directories:
                speaker_files = self._get_utterance_files(speaker)
                speaker_dir = self._make_speaker_dic(speaker_files)
                speakers_list.append(speaker_dir)
            accent_dir = self._make_accent_dic(accent_path, speakers_list)
            accents_list.append(accent_dir)
        return accents_list

    def _make_accent_dic(self, accent_path, speakers_list):
        '''
        Creates the dictionary with info for each accent
        '''
        accent = accent_path[-3:]
        return {
            "accent": accent,
            "accent_path": accent_path,
            "speakers": speakers_list
        }

    def _make_speaker_dic(self, speaker_files):
        '''
        Creates the dictionary with info for each speaker
        '''
        _, gender, speaker_id, _, _ = self.u.path2info(speaker_files[0][0])
        sentences_list = self._make_speaker_sentences_dic(speaker_files)
        return {
            "speaker_id": speaker_id,
            "gender": gender,
            "sentences": sentences_list
        }

    def _make_speaker_sentences_dic(self, speaker_files):
        '''
        Creates the dictionary with info about the sentences for each speaker
        '''
        sentences = []
        for files in speaker_files:
            phoneme_transcription, text_transcription, audio, word_transcription = '', '', '', ''
            for file in files:
                if file.endswith('.phn'):
                    phoneme_transcription = file
                elif file.endswith('.txt'):
                    text_transcription = file
                elif file.endswith('.wav'):
                    audio = file
                elif file.endswith('.wrd'):
                    word_transcription = file
                else:
                    raise ValueError('File of unknown type encountered: ',
                                     file)
            _, _, _, text_type, sentence_number = \
                self.u.path2info(files[0])
            sentence_dir = {
                "text_type": text_type,
                "number": sentence_number,
                "phoneme_transcription": phoneme_transcription,
                "text_transcription": text_transcription,
                "audio": audio,
                "word_transcription": word_transcription
            }
            sentences.append(sentence_dir)
        return sentences

    def _is_subset(self, big, small):
        # Returns whether big contains all of small
        return all([el in big for el in small])

    def _return_ordered(self, lst1, lst2):
        '''
        Returns first the longer, and second the shorter list
        '''
        if len(lst1) <= len(lst2):
            return lst2, lst1
        return lst1, lst2

    def _merge_lists(self, lst1, lst2):
        return lst1 + lst2

    def merge_hierarchies(self, h1, h2):
        assert (len(h1) == len(h2))
        new_hierarchy = []
        for a in range(len(h1)):
            assert h1[a]['accent'] == h2[a]['accent']
            h1_speakers = h1[a]['speakers']
            h2_speakers = h2[a]['speakers']
            longer, shorter = self._return_ordered(h1_speakers, h2_speakers)
            new_speakers = []
            for speaker_dict in longer:
                speaker_id = speaker_dict['speaker_id']
                # Initialize before the inner loop so a found match is not
                # overwritten by later iterations
                merged_sentences = None
                for speaker_dict_2 in shorter:
                    speaker_id_2 = speaker_dict_2['speaker_id']
                    if speaker_id_2 == speaker_id:
                        merged_sentences = self._merge_lists(
                            speaker_dict['sentences'],
                            speaker_dict_2['sentences'])
                        break
                new_speaker = speaker_dict
                if merged_sentences:
                    new_speaker['sentences'] = merged_sentences
                else:
                    new_speaker['sentences'] = speaker_dict['sentences']
                new_speakers.append(new_speaker)
            new_accent = h1[a]
            new_accent['speakers'] = new_speakers
            new_hierarchy.append(new_accent)
        return new_hierarchy
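# A minimal usage sketch; assumes the Constants roots point at local copies
# of the TIMIT and FFMTIMIT corpora
def example_build_merged_hierarchy():
    timit = TIMIT(test=False, timit_version='timit')
    ffmtimit = TIMIT(test=False, timit_version='ffmtimit')
    merged = timit.merge_hierarchies(timit.create_paths_hierarchy(),
                                     ffmtimit.create_paths_hierarchy())
    print('Merged hierarchy covers {} accents'.format(len(merged)))
    return merged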