Example #1
    # path, channel, name, noise_type, duration
    return {
        key: np.array(sorted(val, key=lambda x: x[0]))
        for key, val in all_files.items()
    }


# ==================== run the validation ==================== #
if CURRENT_STATE == SystemStates.EXTRACT_FEATURES:
    ALL_NOISE = validating_noise_data(in_path_raw=PATH_RAW_DATA)
    print("Processed noise data:")
    for ds_name, noise_list in ALL_NOISE.items():
        print(" ", ctext(ds_name, 'yellow'), ':', noise_list.shape)
        if len(noise_list) == 0:
            continue
        for name, count in sorted(freqcount(noise_list[:, 3]).items(),
                                  key=lambda x: x[0]):
            print('  ', ctext('%-10s' % name, 'yellow'), ':',
                  '%s(files)' % ctext('%-6d' % count, 'cyan'))


# ===========================================================================
# Validating the file list of training data
# ===========================================================================
@cache_disk
def validating_training_data(in_path_raw, training_dataset):
    file_list = {
        ds: sre_file_list[ds]
        for ds in training_dataset if ds in sre_file_list
    }
    # ====== meta info ====== #
Example #2
 def transform(self,
               texts,
               mode='seq',
               dtype='int32',
               padding='pre',
               truncating='pre',
               value=0.,
               end_document=None,
               maxlen=None,
               token_not_found='ignore'):
     """
 Parameters
 ----------
 texts: iterator of unicode
     iterator, generator or list (e.g. [u'a', u'b', ...])
     of unicode documents.
 mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
     'binary', abc
     'tfidf', abc
     'count', abc
     'freq', abc
     'seq', abc
 token_not_found: 'ignore', 'raise', a token string, an integer
     pass
 """
     # ====== check arguments ====== #
     texts = self._validate_texts(texts)
     # ====== check mode ====== #
     mode = str(mode)
     if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
         raise ValueError('The "mode" argument must be: "seq", "binary", '
                          '"count", "freq", or "tfidf".')
     # ====== check token_not_found ====== #
     if not is_number(token_not_found) and \
     not is_string(token_not_found) and \
     token_not_found not in ('ignore', 'raise'):
         raise ValueError('token_not_found can be: "ignore", "raise", '
                          'an integer of token index, or a string '
                          'representing a token.')
     if is_number(token_not_found):
         # an explicit integer is used directly as the substitute index
         token_not_found = int(token_not_found)
     elif token_not_found not in ('ignore', 'raise'):
         # a token string is mapped to its index in the dictionary
         token_not_found = int(self.dictionary[token_not_found])
     # ====== pick engine ====== #
     if self.__engine == 'spacy':
         processor = self._preprocess_docs_spacy
     elif self.__engine == 'odin':
         processor = self._preprocess_docs_odin
     # ====== Initialize variables ====== #
     dictionary = self.dictionary
     results = []
     # ====== preprocess arguments ====== #
     if isinstance(end_document, str):
         end_document = dictionary.index(end_document)
     elif is_number(end_document):
         end_document = int(end_document)
     # ====== processing ====== #
     if hasattr(texts, '__len__'):
         target_len = len(texts)
         auto_adjust_len = False
     else:
         target_len = 1234
         auto_adjust_len = True
     prog = Progbar(target=target_len,
                    name="Tokenize Transform",
                    print_report=True,
                    print_summary=True)
     for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
         # found the word in dictionary
         vec = []
         for x in doc:
             idx = dictionary.get(x, -1)
             if idx >= 0:
                 vec.append(idx)
             # the token was not found in the dictionary
             elif token_not_found == 'ignore':
                 continue
             elif token_not_found == 'raise':
                 raise RuntimeError(
                     'Cannot find token: "%s" in dictionary' % x)
             elif isinstance(token_not_found, int):
                 vec.append(token_not_found)
         # append ending document token
         if end_document is not None:
             vec.append(end_document)
         # add the final results
         results.append(vec)
         # print progress
         if self.print_progress:
             prog['#Docs'] = nb_docs
             prog.add(1)
             if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
                 prog.target = 1.2 * prog.target
     # end the process
     # if self.print_progress and auto_adjust_len:
     #     prog.target = nb_docs; prog.update(nb_docs)
     # ====== pad the sequence ====== #
     # just transform into sequence of tokens
     if mode == 'seq':
         maxlen = self.longest_document_length if maxlen is None \
             else int(maxlen)
         results = pad_sequences(results,
                                 maxlen=maxlen,
                                 dtype=dtype,
                                 padding=padding,
                                 truncating=truncating,
                                 value=value)
     # transform into one-hot matrix
     else:
         X = np.zeros(shape=(len(results), self.nb_words))
         for i, seq in enumerate(results):
             if mode == 'binary':
                 X[i, seq] = 1
             elif mode == 'freq':
                 length = len(seq)
                 count = freqcount(seq)
                 for tok, n in count.items():
                     X[i, tok] = n / float(length)
             elif mode == 'count':
                 count = freqcount(seq)
                 for tok, n in count.items():
                     X[i, tok] = n
             elif mode == 'tfidf':
                 count = freqcount(seq)
                 for tok, n in count.items():
                     tf = 1 + np.log(n)
                     docs_freq = self._word_dictionary_info.get(
                         tok, (0, 0))[-1]
                     idf = np.log(1 + self.nb_docs / (1 + docs_freq))
                     X[i, tok] = tf * idf
         results = X
     return results
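A minimal usage sketch of the transform method shown above, assuming a tokenizer object tok of this class that has already been fitted on a corpus (tok, the documents, and maxlen=10 below are hypothetical, chosen only for illustration):

tok = ...  # an already-fitted tokenizer exposing the transform method above
docs = [u'the cat sat on the mat', u'the dog barked']
# 'seq' mode: each document becomes one row of token indices,
# padded/truncated to maxlen with `value`
X_seq = tok.transform(docs, mode='seq', maxlen=10,
                      padding='pre', truncating='pre', value=0.)
# 'count' mode: a (n_docs, nb_words) matrix of raw token counts
X_count = tok.transform(docs, mode='count')
# unknown tokens can be skipped, raised on, or mapped to a fixed index
X_unk = tok.transform(docs, mode='seq', token_not_found='ignore')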
Example #3
for name, (start, end) in indices:
    assert end - start > 0
    if name.split('_')[0] == 'train':
        train.append((name, (start, end)))
    else:
        test.append((name, (start, end)))
    max_length = max(end - start, max_length)
    min_length = min(end - start, min_length)
print(ctext("#Train:", 'yellow'), len(train), train[:2])
print(ctext("#Test:", 'yellow'), len(test), test[:2])
print("Min Length:", ctext(min_length, 'cyan'))
print("Max Length:", ctext(max_length, 'cyan'))
# ====== gender and single digit distribution ====== #
gender_digit = lambda x: x[0].split('_')[1] + '-' + x[0].split('_')[-1]
print(
    print_dist(d=freqcount(train, key=gender_digit),
               show_number=True,
               title="Training distribution"))
print(
    print_dist(d=freqcount(test, key=gender_digit),
               show_number=True,
               title="Testing distribution"))
# ====== digits ====== #
f_digits, digits = unique_labels([i[0] for i in train + test],
                                 key_func=lambda x: x.split('_')[-1],
                                 return_labels=True)
print(ctext("All digits:", 'yellow'), ctext(digits, 'cyan'))
# ====== genders ====== #
f_genders, genders = unique_labels([i[0] for i in train + test],
                                   key_func=lambda x: x.split('_')[1],
                                   return_labels=True)
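The distribution keys above only assume that each utterance name is underscore-delimited, with the train/test tag first, the gender second, and the spoken digit last. A quick sketch with a made-up name (the dataset's real naming scheme is not shown in this snippet):

name = 'train_F_speaker01_7'  # hypothetical utterance name
assert name.split('_')[0] == 'train'  # train/test tag
assert name.split('_')[1] == 'F'      # gender label
assert name.split('_')[-1] == '7'     # spoken digit
# the gender_digit key used for the distribution printout above
assert name.split('_')[1] + '-' + name.split('_')[-1] == 'F-7'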
Example #4
nb_classes = 10  # 10 digits (0-9)

# ===========================================================================
# Create feeder
# ===========================================================================
indices = [(name, start, end) for name, (start, end) in ds['indices']]
longest_utterances = max(
    int(end) - int(start) - 1 for i, start, end in indices)
longest_vad = max(end - start for name, vad in ds['vadids']
                  for (start, end) in vad)
print("Longest Utterance:", longest_utterances)
print("Longest Vad:", longest_vad)

np.random.shuffle(indices)
train, valid, test = train_valid_test_split(indices, train=0.6, inc_test=True)
print('Nb train:', len(train), freqcount([int(i[0][0]) for i in train]))
print('Nb valid:', len(valid), freqcount([int(i[0][0]) for i in valid]))
print('Nb test:', len(test), freqcount([int(i[0][0]) for i in test]))

train_feeder = F.Feeder(ds['mspec'], train, ncpu=1)
test_feeder = F.Feeder(ds['mspec'], test, ncpu=2)
valid_feeder = F.Feeder(ds['mspec'], valid, ncpu=2)

recipes = [
    F.recipes.Name2Trans(converter_func=lambda x: int(x[0])),
    F.recipes.Normalization(mean=ds['mspec_mean'],
                            std=ds['mspec_std'],
                            local_normalize=False),
    F.recipes.Sequencing(frame_length=longest_utterances,
                         hop_length=1,
                         end='pad',
Example #5
File: text.py Project: imito/odin
 def transform(self, texts, mode='seq', dtype='int32',
               padding='pre', truncating='pre', value=0.,
               end_document=None, maxlen=None,
               token_not_found='ignore'):
   """
   Parameters
   ----------
   texts: iterator of unicode
       iterator, generator or list (e.g. [u'a', u'b', ...])
       of unicode documents.
   mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
        'binary', a binary matrix marking which tokens occur in each document
        'tfidf', a matrix of TF-IDF weights per token and document
        'count', a matrix of raw token counts per document
        'freq', token counts divided by the document length
        'seq', each document as a padded sequence of token indices
    token_not_found: 'ignore', 'raise', a token string, an integer
        what to do when a token is not in the dictionary: 'ignore' skips
        it, 'raise' raises an error, a token string or an integer
        substitutes that token's index.
   """
   # ====== check arguments ====== #
   texts = self._validate_texts(texts)
   # ====== check mode ====== #
   mode = str(mode)
   if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
     raise ValueError('The "mode" argument must be: "seq", "binary", '
                      '"count", "freq", or "tfidf".')
   # ====== check token_not_found ====== #
   if not is_number(token_not_found) and \
   not is_string(token_not_found) and \
   token_not_found not in ('ignore', 'raise'):
     raise ValueError('token_not_found can be: "ignore", "raise", '
                      'an integer of token index, or a string '
                      'representing a token.')
    if is_number(token_not_found):
      token_not_found = int(token_not_found)
    elif token_not_found not in ('ignore', 'raise'):
      token_not_found = int(self.dictionary[token_not_found])
   # ====== pick engine ====== #
   if self.__engine == 'spacy':
     processor = self._preprocess_docs_spacy
   elif self.__engine == 'odin':
     processor = self._preprocess_docs_odin
   # ====== Initialize variables ====== #
   dictionary = self.dictionary
   results = []
   # ====== preprocess arguments ====== #
   if isinstance(end_document, str):
     end_document = dictionary.index(end_document)
   elif is_number(end_document):
     end_document = int(end_document)
   # ====== processing ====== #
   if hasattr(texts, '__len__'):
     target_len = len(texts)
     auto_adjust_len = False
   else:
     target_len = 1208
     auto_adjust_len = True
   prog = Progbar(target=target_len, name="Tokenize Transform",
                  print_report=True, print_summary=True)
   for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
     # found the word in dictionary
     vec = []
     for x in doc:
       idx = dictionary.get(x, -1)
       if idx >= 0:
         vec.append(idx)
       # not found the token in dictionary
       elif token_not_found == 'ignore':
         continue
       elif token_not_found == 'raise':
         raise RuntimeError('Cannot find token: "%s" in dictionary' % x)
       elif isinstance(token_not_found, int):
         vec.append(token_not_found)
     # append ending document token
     if end_document is not None:
       vec.append(end_document)
     # add the final results
     results.append(vec)
     # print progress
     if self.print_progress:
       prog['#Docs'] = nb_docs
       prog.add(1)
       if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
         prog.target = 1.2 * prog.target
   # end the process
   # if self.print_progress and auto_adjust_len:
   #     prog.target = nb_docs; prog.update(nb_docs)
   # ====== pad the sequence ====== #
   # just transform into sequence of tokens
   if mode == 'seq':
     maxlen = self.longest_document_length if maxlen is None \
         else int(maxlen)
     results = pad_sequences(results, maxlen=maxlen, dtype=dtype,
                             padding=padding, truncating=truncating,
                             value=value)
   # transform into one-hot matrix
   else:
     X = np.zeros(shape=(len(results), self.nb_words))
     for i, seq in enumerate(results):
       if mode == 'binary':
         X[i, seq] = 1
       elif mode == 'freq':
         length = len(seq)
         count = freqcount(seq)
         for tok, n in count.items():
           X[i, tok] = n / float(length)
       elif mode == 'count':
         count = freqcount(seq)
         for tok, n in count.items():
           X[i, tok] = n
       elif mode == 'tfidf':
         count = freqcount(seq)
         for tok, n in count.items():
           tf = 1 + np.log(n)
           docs_freq = self._word_dictionary_info.get(tok, (0, 0))[-1]
           idf = np.log(1 + self.nb_docs / (1 + docs_freq))
           X[i, tok] = tf * idf
     results = X
   return results
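In the tfidf branch above, a token that occurs n times in a document gets weight tf * idf with tf = 1 + log(n) and idf = log(1 + nb_docs / (1 + docs_freq)), where docs_freq is the number of fitted documents containing the token. A self-contained sketch of that weighting (the function name and sample numbers are illustrative, not part of the odin API):

import numpy as np

def tfidf_weight(n, nb_docs, docs_freq):
    # n: how many times the token occurs in this document
    # nb_docs: total number of documents seen during fitting
    # docs_freq: number of fitted documents containing the token
    tf = 1 + np.log(n)
    idf = np.log(1 + nb_docs / (1 + docs_freq))
    return tf * idf

# e.g. a token occurring twice in one of 1000 documents,
# while appearing in 10 documents overall
print(tfidf_weight(2, 1000, 10))  # ~ 1.69 * 4.52 ~ 7.65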
Example #6
    ((name, start, end) for name, (start, end) in ds['indices'].iteritems()),
    key=lambda x: x[0])
all_labels = list(set(i[0].split("_")[0] for i in indices))
print("Labels:", all_labels)
np.random.shuffle(indices)
np.random.shuffle(indices)
longest_utterance = max(int(end - start) for name, start, end in indices)
print("Longest Utterance:", longest_utterance)
nb_files = len(indices)

train_indices = indices[:int(0.6 * nb_files)]
valid_indices = indices[int(0.6 * nb_files):int(0.8 * nb_files)]
test_indices = indices[int(0.8 * nb_files):]

print("Train distribution:", len(train_indices),
      freqcount([x[0].split('_')[0] for x in train_indices]).items())
print("Valid distribution:", len(valid_indices),
      freqcount([x[0].split('_')[0] for x in valid_indices]).items())
print("Test distribution:", len(test_indices),
      freqcount([x[0].split('_')[0] for x in test_indices]).items())

train = F.Feeder(ds[FEAT], train_indices, ncpu=1)
valid = F.Feeder(ds[FEAT], valid_indices, ncpu=1)
test = F.Feeder(ds[FEAT], test_indices, ncpu=1)

recipes = [
    F.recipes.Name2Trans(
        converter_func=lambda x: all_labels.index(x.split("_")[0])),
    F.recipes.Normalization(mean=ds[FEAT + "_mean"],
                            std=ds[FEAT + "_std"],
                            local_normalize=False),
Example #7
  # ====== return ====== #
  # Header:
  #  0       1      2         3         4
  # path, channel, name, noise_type, duration
  return {key: np.array(sorted(val, key=lambda x: x[0]))
          for key, val in all_files.items()}
# ==================== run the validation ==================== #
if CURRENT_STATE == SystemStates.EXTRACT_FEATURES:
  ALL_NOISE = validating_noise_data(
      in_path_raw=PATH_RAW_DATA)
  print("Processed noise data:")
  for ds_name, noise_list in ALL_NOISE.items():
    print(" ", ctext(ds_name, 'yellow'), ':', noise_list.shape)
    if len(noise_list) == 0:
      continue
    for name, count in sorted(freqcount(noise_list[:, 3]).items(),
                              key=lambda x: x[0]):
      print('  ', ctext('%-10s' % name, 'yellow'), ':',
            '%s(files)' % ctext('%-6d' % count, 'cyan'))
# ===========================================================================
# Validating the file list of training data
# ===========================================================================
@cache_disk
def validating_training_data(in_path_raw, training_dataset):
  file_list = {ds: sre_file_list[ds]
               for ds in training_dataset
               if ds in sre_file_list}
  # ====== meta info ====== #
  all_files = []
  non_exist_files = []
  extension_count = defaultdict(int)
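As a small, self-contained illustration of the counting pattern behind extension_count = defaultdict(int) above, tallying file extensions without pre-initializing keys (the paths below are made up for illustration):

from collections import defaultdict
import os

extension_count = defaultdict(int)
for path in ['/tmp/a.wav', '/tmp/b.wav', '/tmp/c.sph']:  # hypothetical paths
    ext = os.path.splitext(path)[-1]
    extension_count[ext] += 1  # missing keys start from 0 automatically
print(dict(extension_count))  # {'.wav': 2, '.sph': 1}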