def transform(self, texts, mode='seq', dtype='int32',
              padding='pre', truncating='pre', value=0.,
              end_document=None, maxlen=None,
              token_not_found='ignore'):
  """
  Parameters
  ----------
  texts: iterator of unicode
      iterator, generator or list (e.g. [u'a', u'b', ...])
      of unicode documents.
  mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
      'binary', a (n_documents, nb_words) indicator matrix (1 if the token occurs).
      'tfidf', the same matrix weighted by tf-idf: (1 + log(count)) * log(1 + nb_docs / (1 + doc_freq)).
      'count', the same matrix filled with raw token counts.
      'freq', the same matrix with counts normalized by document length.
      'seq', padded sequences of token indices.
  token_not_found: 'ignore', 'raise', a token string, an integer
      how to handle tokens missing from the dictionary: skip them, raise an
      error, or substitute the index of the given token.
  """
  # ====== check arguments ====== #
  texts = self._validate_texts(texts)
  # ====== check mode ====== #
  mode = str(mode)
  if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
    raise ValueError('The "mode" argument must be: "seq", "binary", '
                     '"count", "freq", or "tfidf".')
  # ====== check token_not_found ====== #
  if not is_number(token_not_found) and \
      not is_string(token_not_found) and \
      token_not_found not in ('ignore', 'raise'):
    raise ValueError('token_not_found can be: "ignore", "raise"'
                     ', an integer of token index, or a string '
                     'represented a token.')
  # resolve token_not_found to an integer token index when possible
  if is_number(token_not_found):
    token_not_found = int(token_not_found)
  elif token_not_found not in ('ignore', 'raise'):
    token_not_found = int(self.dictionary[token_not_found])
  # ====== pick engine ====== #
  if self.__engine == 'spacy':
    processor = self._preprocess_docs_spacy
  elif self.__engine == 'odin':
    processor = self._preprocess_docs_odin
  # ====== Initialize variables ====== #
  dictionary = self.dictionary
  results = []
  # ====== preprocess arguments ====== #
  if isinstance(end_document, str):
    end_document = dictionary.index(end_document)
  elif is_number(end_document):
    end_document = int(end_document)
  # ====== processing ====== #
  if hasattr(texts, '__len__'):
    target_len = len(texts)
    auto_adjust_len = False
  else:
    target_len = 1234
    auto_adjust_len = True
  prog = Progbar(target=target_len, name="Tokenize Transform",
                 print_report=True, print_summary=True)
  for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
    vec = []
    for x in doc:
      idx = dictionary.get(x, -1)
      # found the word in dictionary
      if idx >= 0:
        vec.append(idx)
      # not found the token in dictionary
      elif token_not_found == 'ignore':
        continue
      elif token_not_found == 'raise':
        raise RuntimeError('Cannot find token: "%s" in dictionary' % x)
      elif isinstance(token_not_found, int):
        vec.append(token_not_found)
    # append ending document token
    if end_document is not None:
      vec.append(end_document)
    # add the final results
    results.append(vec)
    # print progress
    if self.print_progress:
      prog['#Docs'] = nb_docs
      prog.add(1)
      if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
        prog.target = 1.2 * prog.target
  # end the process
  # if self.print_progress and auto_adjust_len:
  #   prog.target = nb_docs; prog.update(nb_docs)
  # ====== pad the sequence ====== #
  # just transform into sequence of tokens
  if mode == 'seq':
    maxlen = self.longest_document_length if maxlen is None \
        else int(maxlen)
    results = pad_sequences(results, maxlen=maxlen, dtype=dtype,
                            padding=padding, truncating=truncating,
                            value=value)
  # transform into one-hot matrix
  else:
    X = np.zeros(shape=(len(results), self.nb_words))
    for i, seq in enumerate(results):
      if mode == 'binary':
        X[i, seq] = 1
      elif mode == 'freq':
        length = len(seq)
        count = freqcount(seq)
        for tok, n in count.items():
          X[i, tok] = n / float(length)
      elif mode == 'count':
        count = freqcount(seq)
        for tok, n in count.items():
          X[i, tok] = n
      elif mode == 'tfidf':
        count = freqcount(seq)
        for tok, n in count.items():
          tf = 1 + np.log(n)
          docs_freq = self._word_dictionary_info.get(tok, (0, 0))[-1]
          idf = np.log(1 + self.nb_docs / (1 + docs_freq))
          X[i, tok] = tf * idf
    results = X
  return results
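# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the library source): the 'tfidf' branch
# above fills a (n_documents, nb_words) matrix with
# (1 + log(count)) * log(1 + nb_docs / (1 + doc_freq)). The toy corpus and
# names below (toy_docs, doc_freq) are hypothetical; unlike the method, the
# document frequencies here are computed directly from the toy corpus.
# ---------------------------------------------------------------------------
import numpy as np
from collections import Counter

toy_docs = [[0, 1, 1, 2], [1, 3, 3, 3]]  # documents as lists of token indices
nb_words = 4
nb_docs = len(toy_docs)
# document frequency: in how many documents each token appears
doc_freq = Counter(tok for doc in toy_docs for tok in set(doc))

X = np.zeros((nb_docs, nb_words))
for i, seq in enumerate(toy_docs):
  for tok, n in Counter(seq).items():
    tf = 1 + np.log(n)                               # sub-linear term frequency
    idf = np.log(1 + nb_docs / (1 + doc_freq[tok]))  # smoothed inverse doc frequency
    X[i, tok] = tf * idf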
for name, (start, end) in indices:
  assert end - start > 0
  if name.split('_')[0] == 'train':
    train.append((name, (start, end)))
  else:
    test.append((name, (start, end)))
  max_length = max(end - start, max_length)
  min_length = min(end - start, min_length)
print(ctext("#Train:", 'yellow'), len(train), train[:2])
print(ctext("#Test:", 'yellow'), len(test), test[:2])
print("Min Length:", ctext(min_length, 'cyan'))
print("Max Length:", ctext(max_length, 'cyan'))
# ====== gender and single digit distribution ====== #
gender_digit = lambda x: x[0].split('_')[1] + '-' + x[0].split('_')[-1]
print(print_dist(d=freqcount(train, key=gender_digit),
                 show_number=True,
                 title="Training distribution"))
print(print_dist(d=freqcount(test, key=gender_digit),
                 show_number=True,
                 title="Testing distribution"))
# ====== digits ====== #
f_digits, digits = unique_labels([i[0] for i in train + test],
                                 key_func=lambda x: x.split('_')[-1],
                                 return_labels=True)
print(ctext("All digits:", 'yellow'), ctext(digits, 'cyan'))
# ====== genders ====== #
f_genders, genders = unique_labels([i[0] for i in train + test],
                                   key_func=lambda x: x.split('_')[1],
                                   return_labels=True)
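# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the script above): the splits and
# distributions rely on utterance names following a
# "<set>_<gender>_..._<digit>" convention, i.e. split('_')[0] -> 'train'/'test',
# [1] -> gender, [-1] -> digit. The helper and example name are hypothetical.
# ---------------------------------------------------------------------------
def parse_utterance_name(name):
  """Split an utterance name into (subset, gender, digit)."""
  parts = name.split('_')
  return parts[0], parts[1], parts[-1]

# parse_utterance_name('train_f_spk01_3') -> ('train', 'f', '3')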
nb_classes = 10  # 10 digits (0-9)
# ===========================================================================
# Create feeder
# ===========================================================================
indices = [(name, start, end) for name, (start, end) in ds['indices']]
longest_utterances = max(int(end) - int(start) - 1
                         for i, start, end in indices)
longest_vad = max(end - start
                  for name, vad in ds['vadids']
                  for (start, end) in vad)
print("Longest Utterance:", longest_utterances)
print("Longest Vad:", longest_vad)
np.random.shuffle(indices)
train, valid, test = train_valid_test_split(indices, train=0.6, inc_test=True)
print('Nb train:', len(train), freqcount([int(i[0][0]) for i in train]))
print('Nb valid:', len(valid), freqcount([int(i[0][0]) for i in valid]))
print('Nb test:', len(test), freqcount([int(i[0][0]) for i in test]))
train_feeder = F.Feeder(ds['mspec'], train, ncpu=1)
test_feeder = F.Feeder(ds['mspec'], test, ncpu=2)
valid_feeder = F.Feeder(ds['mspec'], valid, ncpu=2)
recipes = [
    F.recipes.Name2Trans(converter_func=lambda x: int(x[0])),
    F.recipes.Normalization(mean=ds['mspec_mean'],
                            std=ds['mspec_std'],
                            local_normalize=False),
    F.recipes.Sequencing(frame_length=longest_utterances, hop_length=1,
                         end='pad',
    ((name, start, end) for name, (start, end) in ds['indices'].items()),
    key=lambda x: x[0])
all_labels = list(set(i[0].split("_")[0] for i in indices))
print("Labels:", all_labels)
np.random.shuffle(indices)
np.random.shuffle(indices)
longest_utterance = max(int(end - start) for name, start, end in indices)
print("Longest Utterance:", longest_utterance)
nb_files = len(indices)
train_indices = indices[:int(0.6 * nb_files)]
valid_indices = indices[int(0.6 * nb_files):int(0.8 * nb_files)]
test_indices = indices[int(0.8 * nb_files):]
print("Train distribution:", len(train_indices),
      freqcount([x[0].split('_')[0] for x in train_indices]).items())
print("Valid distribution:", len(valid_indices),
      freqcount([x[0].split('_')[0] for x in valid_indices]).items())
print("Test distribution:", len(test_indices),
      freqcount([x[0].split('_')[0] for x in test_indices]).items())
train = F.Feeder(ds[FEAT], train_indices, ncpu=1)
valid = F.Feeder(ds[FEAT], valid_indices, ncpu=1)
test = F.Feeder(ds[FEAT], test_indices, ncpu=1)
recipes = [
    F.recipes.Name2Trans(
        converter_func=lambda x: all_labels.index(x.split("_")[0])),
    F.recipes.Normalization(mean=ds[FEAT + "_mean"],
                            std=ds[FEAT + "_std"],
                            local_normalize=False),
  # ====== return ====== #
  # Header:
  #   0        1       2        3          4
  # path,  channel,  name,  noise_type,  duration
  return {key: np.array(sorted(val, key=lambda x: x[0]))
          for key, val in all_files.items()}

# ==================== run the validation ==================== #
if CURRENT_STATE == SystemStates.EXTRACT_FEATURES:
  ALL_NOISE = validating_noise_data(in_path_raw=PATH_RAW_DATA)
  print("Processed noise data:")
  for ds_name, noise_list in ALL_NOISE.items():
    print(" ", ctext(ds_name, 'yellow'), ':', noise_list.shape)
    if len(noise_list) == 0:
      continue
    for name, count in sorted(freqcount(noise_list[:, 3]).items(),
                              key=lambda x: x[0]):
      print('  ', ctext('%-10s' % name, 'yellow'), ':',
            '%s(files)' % ctext('%-6d' % count, 'cyan'))
# ===========================================================================
# Validating the file list of training data
# ===========================================================================
@cache_disk
def validating_training_data(in_path_raw, training_dataset):
  file_list = {ds: sre_file_list[ds]
               for ds in training_dataset
               if ds in sre_file_list}
  # ====== meta info ====== #
  all_files = []
  non_exist_files = []
  extension_count = defaultdict(int)
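# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the script above): each array returned by
# validating_noise_data has one row per file with columns
# (path, channel, name, noise_type, duration), so the per-noise-type counts
# printed above come straight from column 3. The toy array is hypothetical.
# ---------------------------------------------------------------------------
import numpy as np
from collections import Counter

toy_noise = np.array([
    ['/tmp/a.wav', '0', 'a', 'babble', '12.3'],
    ['/tmp/b.wav', '0', 'b', 'music', '8.0'],
    ['/tmp/c.wav', '0', 'c', 'babble', '30.1'],
])
per_type = Counter(toy_noise[:, 3])  # Counter({'babble': 2, 'music': 1})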