Example #1
    def __init__(self, test=False, timit_version=None):
        co = Constants()
        self.u = Utilities()

        if timit_version:
            timit_version = timit_version.lower()
            if timit_version not in ['timit', 'ffmtimit']:
                raise ValueError('Bad timit version')

            if timit_version == 'timit':
                if test:
                    self.SET_ROOT = co.TIMIT_TEST_ROOT
                else:
                    self.SET_ROOT = co.TIMIT_TRAIN_ROOT

            elif timit_version == 'ffmtimit':
                if test:
                    self.SET_ROOT = co.FFMTIMIT_TEST_ROOT
                else:
                    self.SET_ROOT = co.FFMTIMIT_TRAIN_ROOT
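
A minimal usage sketch, assuming this __init__ belongs to the TIMIT class shown in Example #12 and that Constants defines the four *_ROOT paths:

tim_train = TIMIT(timit_version='timit')                # SET_ROOT = co.TIMIT_TRAIN_ROOT
tim_test = TIMIT(test=True, timit_version='ffmtimit')   # SET_ROOT = co.FFMTIMIT_TEST_ROOT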
Example #2

from helper import Utilities, PerformanceEvaluation
import pandas as pd
import numpy as np
from metric_learning import MetricLearning, Subsampling
from user_feedback import Similarity
import pickle
from scipy.misc import comb  # note: comb lives in scipy.special in newer SciPy releases
import time
"""
In the demo, we will showcase an example of special purpose publication.
The data user wants the published energy database to maximally retain the information about peak-time energy usage
"""

# Initialization of some useful classes
util = Utilities()
pe = PerformanceEvaluation()
mel = MetricLearning()

# step 1: get the database to be published
day_profile = pd.read_pickle('../dataset/dataframe_all_energy.pkl')
# subsample the database to improve the speed for demonstration purposes
day_profile = day_profile.iloc[0:90, 0::4]
day_profile.index = range(len(day_profile.index))
rep_mode = 'mean'
anonymity_level = 2  # desired anonymity level

# step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the
# information of the cumulative energy use during peak time. In this case, he/she would also need to specify the
# starting and ending time of the peak usage time
interest = 'window-usage'
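
The snippet is cut off here; judging from Example #6, which uses the same 'window-usage' interest, the next step would specify the peak window as its start and end indices:

window = [17, 21]  # peak-usage window, as in Example #6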
Example #3
from helper import Utilities, PerformanceEvaluation
import pandas as pd
from subsampling import Subsampling
from user_feedback import Similarity
from scipy.misc import comb
from deep_metric_learning import Deep_Metric
import numpy as np
import pickle
from linear_metric_learning import Linear_Metric
import pdb

# """
# In the demo, we will showcase an example of special purpose publication.
# The data user wants the published database to maximally retain the information about lunch time.
# """

# Initialization of some useful classes
util = Utilities()
pe = PerformanceEvaluation()

day_profile_all = pd.read_pickle('./dataset/dataframe_all_binary.pkl')
day_profile = day_profile_all.iloc[0:90, 0::60]  # the database to be published
day_profile_metric_learn = day_profile_all.iloc[90:-1, 0::60]  # the database for learning the distance metric
day_profile = day_profile.dropna()  # dropna() returns a new frame, so assign the result back
day_profile_metric_learn = day_profile_metric_learn.dropna()

rep_mode = 'mean'
anonymity_level = 2  # desired anonymity level

# step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the
# information of a segment of entire time series. In this case, he/she would also need to specify the starting and
# ending time of the time series segment of interest.
Example #4
from helper import Utilities, PerformanceEvaluation
import pandas as pd
from user_feedback import Similarity
from scipy.misc import comb
from deep_metric_learning import Deep_Metric
import numpy as np
import pickle
from linear_metric_learning import Linear_Metric
from subsampling import Subsampling
"""
In the demo, we will showcase an example of special purpose publication.
The data user wants the published database to maximally retain the information about lunch time.
"""

# Initialization of some useful classes
util = Utilities()
pe = PerformanceEvaluation()


def evaluation_occupancy_statistics(n, mode="arrival"):
    day_profile1 = pd.read_pickle('./dataset/dataframe_all_binary.pkl')
    day_profile1 = day_profile1.fillna(0)
    day_profile1[day_profile1 > 0] = 1

    res = 15

    # subsample the database to improve the speed for demonstration purposes
    day_profile = day_profile1.iloc[:90, 0::res]
    day_profile2 = day_profile1.iloc[90:-1, 0::res]
Example #5
from helper import Utilities, PerformanceEvaluation
import pandas as pd
from user_feedback import Similarity
from scipy.misc import comb
from deep_metric_learning import Deep_Metric
import numpy as np
import pickle
from linear_metric_learning import Linear_Metric
from subsampling import Subsampling

"""
In the demo, we will showcase an example of special purpose publication.
The data user wants the published database to maximally retain the information about lunch time.
"""

# Initialization of some useful classes
util = Utilities()
pe = PerformanceEvaluation()

def evaluation_occupancy_statistics(n, df_subsampled_from, day_profile):

    anonymity_level = n
    mode = "arrival"
    rep_mode = 'mean'
    subsample_size_max = int(comb(len(df_subsampled_from), 2))

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(subsample_size=subsample_size, seed=None)

    # User receives the data pairs and label the similarity
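
The snippet ends mid-step; a sketch of the labeling call, modeled on the same step in Example #6 (the occupancy-specific arguments of extract_interested_attribute are not shown in this snippet, so they are omitted here):

    sim = Similarity(data=data_pair)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(range_n_clusters=range(2, 8))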
Example #6
def evaluation_total_usage(n):
    """
    In the demo, we will showcase an example of special purpose publication.
    The data user wants the published energy database to maximally retain the information about peak-time energy usage
    """

    # Initialization of some useful classes
    util = Utilities()
    pe = PerformanceEvaluation()

    # step 1: get the database to be published
    day_profile = pd.read_pickle('dataset/dataframe_all_energy.pkl')
    day_profile = day_profile.fillna(0)
    # subsample the database to improve the speed for demonstration purposes
    day_profile = day_profile.iloc[0:90, 0::4]
    day_profile.index = range(len(day_profile.index))
    rep_mode = 'mean'
    anonymity_level = n  # desired anonymity level

    # step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the
    # information of the cumulative energy use during peak time. In this case, he/she would also need to specify the
    # starting and ending time of the peak usage time
    interest = 'window-usage'
    window = [17, 21]

    sanitized_profile_best = util.sanitize_data(
        day_profile,
        distance_metric='self-defined',
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        mode=interest,
        window=window)

    # step 3: pre-sanitize the database
    sanitized_profile_baseline = util.sanitize_data(
        day_profile,
        distance_metric='euclidean',
        anonymity_level=anonymity_level,
        rep_mode=rep_mode)

    loss_best_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_best,
        mode=interest,
        window=window)

    loss_generic_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_baseline,
        mode=interest,
        window=window)
    # print("information loss with learned metric %s" % loss_generic_metric)

    df_subsampled_from = sanitized_profile_baseline.drop_duplicates().sample(
        frac=1)
    subsample_size_max = int(comb(len(df_subsampled_from), 2))

    print('total number of pairs is %s' % subsample_size_max)

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(
        subsample_size=subsample_size, seed=None)

    # User receives the data pairs and label the similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest='statistics',
                                     stat_type=interest,
                                     window=window)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(
        range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metric
    sanitized_profile_deep = util.sanitize_data(
        day_profile,
        distance_metric="deep",
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        deep_model=dm,
        window=window)

    sanitized_profile = util.sanitize_data(day_profile,
                                           distance_metric="deep",
                                           anonymity_level=anonymity_level,
                                           rep_mode=rep_mode,
                                           deep_model=lm,
                                           window=window)

    # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database
    loss_learned_metric_deep = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_deep.round(),
        mode=interest,
        window=window)

    loss_learned_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile,
        mode=interest,
        window=window)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep  %s" %
          (loss_learned_metric_deep))
    return (sanitized_profile_best, sanitized_profile_baseline,
            sanitized_profile,
            sanitized_profile_deep), (loss_best_metric, loss_generic_metric,
                                      loss_learned_metric,
                                      loss_learned_metric_deep), subsample_size
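
A hedged driver for this function; n=2 is just an illustrative anonymity level:

if __name__ == '__main__':
    profiles, losses, sample_size = evaluation_total_usage(n=2)
    print(losses)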
Example #7

import pandas as pd
from helper import Utilities

util = Utilities()

# step1: get original database to be published
day_profile = pd.read_pickle('../dataset/dataframe_all_binary.pkl')

# (optional) subsample the time series in each row of the database
res = 15
day_profile = day_profile.iloc[0::5, 0::res]

# step2: specify the desired anonymity level
anonymity_level = 5

# util.sanitize_data will privatize the database according to the desired anonymity level
sanitized_profile = util.sanitize_data(day_profile, distance_metric='euclidean',
                                       anonymity_level=anonymity_level, rep_mode='mean')
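
One optional follow-up, shown purely as a sketch (the output filename is an assumption), is to persist the published database with pandas:

sanitized_profile.to_pickle('../dataset/dataframe_all_binary_sanitized.pkl')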


Example #8

    def __init__(self, set_name):
        self.SET_NAME = set_name
        self.co = Constants()
        self.io = DataFiles()
        self.u = Utilities()
Example #9

class Preprocessor:
    def __init__(self, set_name):
        self.SET_NAME = set_name
        self.co = Constants()
        self.io = DataFiles()
        self.u = Utilities()

    def data_exists(self):
        filename = '{}.npz'.format(self.SET_NAME)
        return os.path.exists(os.path.join(self.co.DATA_ROOT, filename))

    def path_hierarchy_exists(self):
        filename = 'timit_{}_path_hierarchy.json'.format(self.SET_NAME)
        return os.path.exists(os.path.join(self.co.DATA_ROOT, filename))

    def path_hierarchy_with_features_exists(self):
        filename = 'timit_{}_samples.json'.format(self.SET_NAME)
        return os.path.exists(os.path.join(self.co.DATA_ROOT, filename))

    def create_hierarchies(self):

        hierarchies = []
        for timit_version in ['timit', 'ffmtimit']:
            if self.SET_NAME == 'test':
                self.tim = TIMIT(test=True, timit_version=timit_version)
            else:
                self.tim = TIMIT(timit_version=timit_version)

            print('Creating path hierarchy for {}...'.format(timit_version))
            hierarchy = self.tim.create_paths_hierarchy()
            hierarchies.append(hierarchy)

        print('Merging the two')
        # Merges the TIMIT and FFMTIMIT hierarchies
        full_hierarchy = self.tim.merge_hierarchies(hierarchies[0],
                                                    hierarchies[1])

        print('Exporting path hierarchy...')
        hierarchy_filename = 'timit_{}_path_hierarchy.json'.format(
            self.SET_NAME)
        self.io.export_to_json_lines(full_hierarchy, hierarchy_filename)

        print('Creating samples and mspec features...')
        for accent in self.io.import_from_json_lines(hierarchy_filename):
            print('\tFor accent: ', accent['accent'])
            speakers = accent['speakers']

            for speaker in speakers:
                speaker_sentences = speaker['sentences']

                for sentence in speaker_sentences:
                    audio = sentence['audio']

                    samples, samplingrate = self.u.loadAudio(audio)
                    # power_spectrum = self.u.get_power_spectrum(samples)
                    log_mspec = self.u.get_mspec(samples,
                                                 samplingrate=samplingrate)
                    # mspec = librosa.feature.melspectrogram(S=power_spectrum,
                    #                                        sr=samplingrate,
                    #                                        n_mels=64)
                    # log_mspec = librosa.core.amplitude_to_db(mspec)

                    samples_path = self.io.store_in_archive(
                        samples, sentence, self.SET_NAME, 'samples')
                    mspec_path = self.io.store_in_archive(
                        log_mspec, sentence, self.SET_NAME, 'mspec')

                    sentence["samples_path"] = samples_path
                    sentence["audio_sr"] = samplingrate
                    sentence["mspec_path"] = mspec_path

            print('\t\tAttempting to store entry')
            samples_filename = 'timit_{}_samples.json'.format(self.SET_NAME)
            self.io.export_entry_to_json_line(accent, samples_filename)
            print('\t\tStored')

        print('Hierarchy stored in {}'.format(hierarchy_filename))

    def transform_data(self):
        '''
        Reads the already created hierarchy of paths,
        loads the stored mspec features and collects the
        utterances together with their accent targets.
        Returns the path of the archive the data is stored in.
        '''
        entries = self.io.import_from_json_lines(
            'timit_{}_samples.json'.format(self.SET_NAME))

        samples, targets = [], []

        for entry in entries:
            accent = entry['accent']
            for speaker in entry['speakers']:
                for sentence in speaker['sentences']:
                    # The mspec features are 128*frames
                    mspec_features = np.load(sentence['mspec_path'])
                    # Transpose them to frames*128
                    samples.append(mspec_features.T)
                    # Each accent is a string in dri where i =[1, 8]
                    targets.append(accent)

        targets_int = self._targets_to_ints(targets)

        stored_into = self._store_data(samples, targets_int, targets)

        return stored_into

    def standardize_dataset(self):
        '''
        Performs dataset-level standardization
        '''
        data = np.load(
            os.path.join(self.co.DATA_ROOT, '{}.npz'.format(self.SET_NAME)))
        samples = data['X']

        _, sentence_indexes_tuples = self._get_indexes_to_slice()
        data_stack = np.vstack(samples)

        scaler = StandardScaler()

        if self.SET_NAME == 'train':
            scaled = scaler.fit_transform(data_stack)

            self.TRAIN_MEAN = scaler.mean_
            self.TRAIN_VAR = scaler.var_

            filename = os.path.join(self.co.DATA_ROOT, 'training_set_meta.npz')
            np.savez(filename, mean=self.TRAIN_MEAN, var=self.TRAIN_VAR)

        elif self.SET_NAME == 'test':
            meta = np.load(
                os.path.join(self.co.DATA_ROOT, 'training_set_meta.npz'))
            self.TRAIN_MEAN = meta['mean']
            self.TRAIN_VAR = meta['var']

            # Reuse the training statistics; divide by the standard deviation
            # (square root of the stored variance) to mirror StandardScaler.
            scaled = np.divide(data_stack - self.TRAIN_MEAN,
                               np.sqrt(self.TRAIN_VAR))

        all_scaled_samples = []

        for slice_tuple in sentence_indexes_tuples:
            slice_start = slice_tuple[0]
            slice_end = slice_tuple[1]

            sentence = scaled[slice_start:slice_end]

            all_scaled_samples.append(sentence.tolist())

        stored_into = self._store_data(all_scaled_samples, data['Y'],
                                       data['Y_string'], 'dataset_scaled')

        return stored_into

    def standardize_speaker(self):
        '''
        Performs speaker-level standardization
        '''
        data = np.load(
            os.path.join(self.co.DATA_ROOT, '{}.npz'.format(self.SET_NAME)))
        samples = data['X']

        speaker_indexes_tuples, sentence_indexes_tuples = \
            self._get_indexes_to_slice()

        all_scaled_samples = []
        scaler = StandardScaler()

        for slice_tuple in speaker_indexes_tuples:
            slice_start = slice_tuple[0]
            slice_end = slice_tuple[1]

            # Slice the samples list, stack it to get 2d matrix
            speaker_slice = np.vstack(samples[slice_start:slice_end])
            # Scaling
            scaled = scaler.fit_transform(speaker_slice)

            # Add to the list a list of the scaled samples
            all_scaled_samples.append(scaled)

        data_stack = np.vstack(all_scaled_samples)

        all_scaled_samples = []

        for slice_tuple in sentence_indexes_tuples:
            slice_start = slice_tuple[0]
            slice_end = slice_tuple[1]

            sentence = data_stack[slice_start:slice_end]

            all_scaled_samples.append(sentence.tolist())

        stored_into = self._store_data(all_scaled_samples, data['Y'],
                                       data['Y_string'], 'speaker_scaled')

        return stored_into

    def _get_indexes_to_slice(self):
        hierarchy_filename = 'timit_{}_samples.json'.format(self.SET_NAME)

        speaker_indexes_tuples = []
        previous_speakers_sentences = 0

        sentence_indexes_tuples = []
        previous_sentences = 0

        for accent in self.io.import_from_json_lines(hierarchy_filename):
            speakers = accent['speakers']

            for speaker in speakers:
                speaker_sentences_num = len(speaker['sentences'])

                slice_start = previous_speakers_sentences
                slice_end = previous_speakers_sentences + speaker_sentences_num

                speaker_indexes_tuples.append((slice_start, slice_end))

                previous_speakers_sentences += speaker_sentences_num

                for sentence in speaker['sentences']:
                    mspec_features = np.load(sentence['mspec_path'])

                    sentence_frames = np.shape(mspec_features.T)[0]
                    slice_start = previous_sentences
                    slice_end = previous_sentences + sentence_frames

                    sentence_indexes_tuples.append((slice_start, slice_end))
                    previous_sentences += sentence_frames

        return speaker_indexes_tuples, sentence_indexes_tuples

    def _targets_to_ints(self, targets):
        '''
        Turns a list of strings into a list of int ids (one id per unique string)
        '''
        # e.g. ['dr3', 'dr1', 'dr3'] -> unique = ['dr1', 'dr3'] -> [1, 0, 1]
        unique = np.unique(targets).tolist()
        out = [unique.index(t) for t in targets]

        return out

    def _store_data(self, samples, targets_int, targets, scaling_type=''):
        if scaling_type == '':
            scaling = ''
        else:
            scaling = scaling_type + '_'

        filename = os.path.join(self.co.DATA_ROOT,
                                '{}{}.npz'.format(scaling, self.SET_NAME))

        np.savez(filename, X=samples, Y=targets_int, Y_string=targets)

        return filename

    def add_noise(self, samples):
        '''
        Adds noise drawn from a standard Gaussian to each sample
        samples: a list of frames*128 samples

        Returns a samples list double the size of the argument,
        with the noisy samples appended to the end of it
        '''

        samples = samples.tolist()
        for i in range(len(samples)):
            noise = np.random.standard_normal(np.shape(samples[i]))
            samples.append(samples[i] + noise)

        return samples
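
A sketch of how the Preprocessor methods might be chained for the training set; this driver is not part of the original snippet and assumes Constants, DataFiles, TIMIT and Utilities are importable:

pre = Preprocessor('train')
if not pre.path_hierarchy_with_features_exists():
    pre.create_hierarchies()   # builds the path hierarchy and mspec features
if not pre.data_exists():
    pre.transform_data()       # packs samples and accent targets into train.npz
pre.standardize_speaker()      # writes speaker_scaled_train.npz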
Example #10
    def __init__(self):
        self.co = Constants()
        self.u = Utilities()
Example #11
class DataFiles:
    def __init__(self):
        self.co = Constants()
        self.u = Utilities()

    def export_to_json_lines(self, hierarchy, filename, indent=None):
        if not filename.endswith('.json'):
            raise ValueError('Filename does not end in .json')

        with open(os.path.join(self.co.DATA_ROOT, filename), 'w') as f:
            for line in hierarchy:
                json_s = json.dumps(line, indent=indent)
                f.write(json_s)
                f.write('\n')

    def import_from_json_lines(self, filename):
        if not filename.endswith('.json'):
            raise ValueError('Filename does not end in .json')

        with open(os.path.join(self.co.DATA_ROOT, filename), 'r') as f:
            for line in f:
                yield (json.loads(line))

    def export_entry_to_json_line(self, entry, filename, indent=None):
        # if not os.path.isfile(filename):
        #     with open(os.path.join(self.co.DATA_ROOT, filename), 'w') as f:
        #         log_feed = []
        #         json_s = json.dumps(log_feed, indent)
        #         f.write(json_s)

        # with open(os.path.join(self.co.DATA_ROOT, filename), 'r') as f:
        #     log_feed = json.load(f)
        #     log_feed.append(entry)

        with open(os.path.join(self.co.DATA_ROOT, filename), 'a') as f:
            json_s = json.dumps(entry, indent=indent)
            f.write(json_s)
            f.write('\n')

    def store_in_archive(self, arr, sentence, set_name, arr_type):
        if type(arr) != np.ndarray:
            raise ValueError('Can only store numpy.ndarray. '
                             'Was passed: {}'.format(type(arr)))

        accent, gender, speaker_id, text_type, sentence_number = \
            self.u.path2info(sentence['audio'])

        filename = '{}_{}{}.npy'.format(arr_type, text_type, sentence_number)

        root = self.create_or_return(
            (self.co.DATA_ROOT, set_name, arr_type, accent,
             '{}{}'.format(gender, speaker_id)))

        file = os.path.join(root, filename)
        np.save(file, arr)

        return file

    def create_or_return(self, path_parts_tuple):
        path = path_parts_tuple[0]
        for part in path_parts_tuple[1:]:
            path = os.path.join(path, part)
            if not os.path.exists(path):
                os.mkdir(path)

        return path
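
A small round-trip sketch for the JSON-lines helpers; the entry dict and filename are made up for illustration:

io = DataFiles()
io.export_to_json_lines([{'accent': 'dr1', 'speakers': []}], 'example_hierarchy.json')
entries = list(io.import_from_json_lines('example_hierarchy.json'))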
Example #12
class TIMIT:
    def __init__(self, test=False, timit_version=None):
        co = Constants()
        self.u = Utilities()

        if timit_version:
            timit_version = timit_version.lower()
            if timit_version not in ['timit', 'ffmtimit']:
                raise ValueError('Bad timit version')

            if timit_version == 'timit':
                if test:
                    self.SET_ROOT = co.TIMIT_TEST_ROOT
                else:
                    self.SET_ROOT = co.TIMIT_TRAIN_ROOT

            elif timit_version == 'ffmtimit':
                if test:
                    self.SET_ROOT = co.FFMTIMIT_TEST_ROOT
                else:
                    self.SET_ROOT = co.FFMTIMIT_TRAIN_ROOT

    def _get_accent_paths(self):
        '''
        Returns a list containing the paths to the accent directories
        in the selected set, starting from SET_ROOT
        '''
        return [
            os.path.join(self.SET_ROOT, accent_dir)
            for accent_dir in os.listdir(self.SET_ROOT)
        ]

    def _get_speaker_files(self, accent):
        '''
        accent: path to accent directory in timit
        Returns list of paths to each speaker in the given accent directory
        '''
        return [
            os.path.join(accent, speaker_dir)
            for speaker_dir in os.listdir(accent)
        ]

    def _get_utterance_files(self, speaker):
        '''
        speaker: path to a speaker directory in timit
        Returns list of tuples
        Each tuple contains four paths, each path corresponds to the .wav,
        .txt, .wrd and .phn files for each sentence the speaker has uttered
        '''

        files = [
            os.path.join(speaker, file_type)
            for file_type in os.listdir(speaker)
        ]

        speaker_files = []
        count = 0
        # Group the files in chunks of four (.wav, .txt, .wrd, .phn per sentence);
        # the range ends at len(files) + 1 so the final chunk is not dropped.
        for end in range(4, len(files) + 1, 4):
            start = count
            chunk = files[start:end]
            speaker_files.append(tuple(chunk))

            count += len(chunk)

        return speaker_files

    def create_paths_hierarchy(self):
        '''
        Returns a list of accent dictionaries
        Each accent dictionary contains the name of the accent, the path
            to its directory and a list of speakers for the accent
        The list of speakers contains the speaker id, the gender,
            and a list of sentences the speaker has uttered
        The list of sentences contains the text type, the sentence number,
            and the paths to the phonetic, text and word transcriptions
            as well as the path to the audio file of the recording
        '''
        accent_paths = self._get_accent_paths()

        accents_list = []
        for accent_path in accent_paths:

            speaker_directories = self._get_speaker_files(accent_path)

            speakers_list = []
            for speaker in speaker_directories:
                speaker_files = self._get_utterance_files(speaker)

                speaker_dir = self._make_speaker_dic(speaker_files)

                speakers_list.append(speaker_dir)

            accent_dir = self._make_accent_dic(accent_path, speakers_list)

            accents_list.append(accent_dir)

        return accents_list

    def _make_accent_dic(self, accent_path, speakers_list):
        '''
        Creates the dictionary with info for each accent
        '''
        accent = accent_path[-3:]

        return {
            "accent": accent,
            "accent_path": accent_path,
            "speakers": speakers_list
        }

    def _make_speaker_dic(self, speaker_files):
        '''
        Creates the dictionary with info for each speaker
        '''
        _, gender, speaker_id, _, _ = self.u.path2info(speaker_files[0][0])

        sentences_list = self._make_speaker_sentences_dic(speaker_files)

        return {
            "speaker_id": speaker_id,
            "gender": gender,
            "sentences": sentences_list
        }

    def _make_speaker_sentences_dic(self, speaker_files):
        '''
        Creates the dictionary with info about the sentences for each speaker
        '''
        sentences = []
        for files in speaker_files:
            phoneme_transcription, text_transcription, audio, word_transcription = '', '', '', ''
            for file in files:
                if file.endswith('.phn'):
                    phoneme_transcription = file
                elif file.endswith('.txt'):
                    text_transcription = file
                elif file.endswith('.wav'):
                    audio = file
                elif file.endswith('.wrd'):
                    word_transcription = file
                else:
                    raise ValueError('File of unknown type encountered: ', file)

            _, _, _, text_type, sentence_number = self.u.path2info(files[0])

            sentence_dir = {
                "text_type": text_type,
                "number": sentence_number,
                "phoneme_transcription": phoneme_transcription,
                "text_transcription": text_transcription,
                "audio": audio,
                "word_transcription": word_transcription
            }

            sentences.append(sentence_dir)

        return sentences

    def _is_subset(self, big, small):
        # Returns whether big contains all of small
        return all([el in big for el in small])

    def _return_ordered(self, lst1, lst2):
        '''
        Returns first the longer, and second the shorter list
        '''

        if len(lst1) <= len(lst2):
            return lst2, lst1

        return lst1, lst2

    def _merge_lists(self, lst1, lst2):
        return lst1 + lst2

    def merge_hierarchies(self, h1, h2):
        assert (len(h1) == len(h2))

        new_hierarchy = []
        for a in range(len(h1)):
            assert h1[a]['accent'] == h2[a]['accent']

            h1_speakers = h1[a]['speakers']
            h2_speakers = h2[a]['speakers']

            longer, shorter = self._return_ordered(h1_speakers, h2_speakers)

            new_speakers = []
            for speaker_dict in longer:
                speaker_id = speaker_dict['speaker_id']

                # Initialize before the inner loop so that a match found early
                # is not reset to None by the remaining iterations.
                merged_sentences = None
                for speaker_dict_2 in shorter:
                    speaker_id_2 = speaker_dict_2['speaker_id']

                    if speaker_id_2 == speaker_id:
                        merged_sentences = self._merge_lists(
                            speaker_dict['sentences'],
                            speaker_dict_2['sentences'])
                        break

                new_speaker = speaker_dict
                if merged_sentences:
                    new_speaker['sentences'] = merged_sentences
                else:
                    new_speaker['sentences'] = speaker_dict['sentences']

                new_speakers.append(new_speaker)

            new_accent = h1[a]
            new_accent['speakers'] = new_speakers
            new_hierarchy.append(new_accent)

        return new_hierarchy
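
Finally, a sketch of building and merging the two hierarchies directly with this class, mirroring what Preprocessor.create_hierarchies does above:

timit = TIMIT(timit_version='timit')
ffmtimit = TIMIT(timit_version='ffmtimit')
merged = timit.merge_hierarchies(timit.create_paths_hierarchy(),
                                 ffmtimit.create_paths_hierarchy())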