def main(argv):
    """Build a SpeechProblem from the selected utterances and write it out.

    Parses the flags in *argv*, loads the corpus at FLAGS.corpus_path, picks
    the utterance list according to FLAGS.data_set, extracts features from
    each utterance's ".wav" file, and finally extracts centers and writes the
    problem under FLAGS.output_name.

    Args:
      argv: command-line arguments, handed to FLAGS for parsing.
    """
    argv = FLAGS(argv)
    timit = load_timit(FLAGS.corpus_path)
    extractor = FeatureExtractor()

    # Recognised FLAGS.data_set values mapped to how many of the files
    # returned by load_brugnara_files to keep; None keeps them all.  Any other
    # value leaves the utterance list empty.
    subset_sizes = {"brugnara": None, "hundred": 100, "ten": 10, "one": 1}
    utterance_names = []
    if FLAGS.data_set in subset_sizes:
        utterance_names = load_brugnara_files(timit)
        keep = subset_sizes[FLAGS.data_set]
        if keep is not None:
            utterance_names = utterance_names[:keep]
    print(utterance_names)

    speech_problem = SpeechProblem(timit, FLAGS.output_name)
    speech_problem.extract_phonemes()
    for name in utterance_names:
        wav_path = timit.abspath(name + ".wav")
        speech_problem.add_utterance(name, extractor.extract_features(wav_path))
    speech_problem.extract_centers()
    speech_problem.write()
def _load_data_sets():
    """Load the dev, train and test corpora with their utterance lists.

    Returns:
      A list of (corpus, utterance_names, suffix) tuples in dev, train, test
      order.  When FLAGS.shrink_data is set, every utterance list is cut down
      to its first 10 entries.
    """
    # (corpus sub-directory, utterance-list loader, output-file suffix).
    # The dev and test sets are both read from the TIMITNLTKTEST directory.
    sources = [
        ("TIMITNLTKTEST", load_core_dev_files, "dev"),
        ("TIMITNLTK39", load_training_files, "train"),
        ("TIMITNLTKTEST", load_core_test_files, "test"),
    ]
    data_sets = []
    for directory, load_files, suffix in sources:
        timit = load_timit(FLAGS.corpus_path + directory)
        files = load_files(timit)
        if FLAGS.shrink_data:
            files = files[:10]
        data_sets.append((timit, files, suffix))
    return data_sets


def _collect_frames(data_sets):
    """Extract features for every utterance and pool them across data sets.

    Args:
      data_sets: list of (corpus, utterance_names, suffix) tuples.

    Returns:
      A tuple (all_features, all_states, utterance_features).  The first two
      are parallel lists holding every (state, feature) pair produced by
      construct_gold, in iteration order.  utterance_features has one entry
      per utterance: the list of positions that utterance's pairs occupy in
      the pooled lists.
    """
    extractor = FeatureExtractor()
    all_features = []
    all_states = []
    utterance_features = []
    for timit, utterance_names, _ in data_sets:
        for utterance_file in utterance_names:
            features = extractor.extract_features(
                timit.abspath(utterance_file + ".wav"))
            global_indices = []
            for state, feature in construct_gold(timit, utterance_file, features):
                # The next free slot in the pooled lists is this pair's index.
                global_indices.append(len(all_features))
                all_features.append(feature)
                all_states.append(state)
            utterance_features.append(global_indices)
    return all_features, all_states, utterance_features


def main(argv):
    """Build a VQ code book over all data sets and write the coded utterances.

    Features from the dev, train and test sets are pooled, a code book of
    FLAGS.vq_size entries is built from them with VQ.make_code_book, and one
    output file per data set is written.  Each file is named
    FLAGS.output_prefix + "_" + vq_size + "_" + suffix (plus "_shrink" when
    FLAGS.shrink_data is set) and holds one line per utterance made of
    space-separated "state/code" tokens.

    Args:
      argv: command-line arguments, handed to FLAGS for parsing.
    """
    argv = FLAGS(argv)
    data_sets = _load_data_sets()
    all_features, all_states, utterance_features = _collect_frames(data_sets)

    vq = VQ(FLAGS.vq_size)
    # vqs is indexed by pooled position, parallel to all_states.
    vqs = vq.make_code_book(all_features, all_states)

    # utterance_features was filled in the same data-set/utterance order that
    # is walked here, so a single running index lines the two up.
    utterance_ind = 0
    for _timit, utterance_names, suffix in data_sets:
        file_name = FLAGS.output_prefix + "_" + str(FLAGS.vq_size) + "_" + suffix
        if FLAGS.shrink_data:
            file_name += "_shrink"
        # The with-block guarantees the file is flushed and closed before the
        # next data set is written, even if formatting a line raises.
        with open(file_name, 'w') as out_file:
            for _utterance in utterance_names:
                feature_inds = utterance_features[utterance_ind]
                utterance_ind += 1
                tokens = ["%s/%s" % (all_states[ind], vqs[ind])
                          for ind in feature_inds]
                out_file.write(" ".join(tokens) + "\n")
print i, p # All the male utterances of a region. brugnara = set([l.strip() for l in open("corpus")]) #f.startswith("dr1-f") utterance_names = [f for f in timit.utterances() if f.split("-")[1].split("/")[0] in brugnara and "sa1" not in f and "sa2" not in f] print len(utterance_names) utterance_set = speech.UtteranceSet() all_features = [] #for utterance_file in utterance_names: for utterance_file in utterance_names: #for utterance_file in [ u for u in utterance_names if u == "dr8-mbcg0/sx57"]: # extract features from an audio file using AudioFileProcessor afp = AudioFileProcessor() afp.processFile(engine, timit.abspath(utterance_file + ".wav")) phone_times = timit.phone_times(utterance_file) print phone_times last = float(phone_times[-1][2]) features = engine.readAllOutputs() #print timit.sents(utterance_file) utterance = utterance_set.utterances.add() for phone in timit.phones(utterance_file): if phone == "q": continue utterance.phones.append(phoneme_map[phone]) #print features["mfcc"] final = len(features["mfcc"]) #print " ".join([ str(p) for (p, s, e) in phone_times])