except IOError: print "cache not found. Recomputing" c = fun(*args) np.save(fname, c) return c pi, A, pi_v, theta_v, word_freq = cache_or_compute("cache/bigram.npy", compute_bigram, debug=False) ################################################################### # Segmentation ################################################################### starts, ends, chunks = cache_or_compute( "cache/chunks.npy", lambda arg: get_chunk_starts(arg), data[int(file_range[0] * len(data)) : int(file_range[1] * len(data))], debug=False, ) if not isinstance(starts, list): starts = starts.astype(int) ends = ends.astype(int) if "Segmentation" in PRINT_SET: print "num_chunks", len(chunks) spaces = cache_or_compute("cache/spaces.npy", find_spaces, data, starts, ends) N, M = 0, 1000000 ii1 = np.zeros(len(data))
Example: ./aggregate 7 sound7.*.wav will aggregate sound files sound7.1.wav sound7.2.wav, etc with corresponding text files text7.1.txt text7.2.txt The output aggregation will be guaranteed to have perfect segmentation ''' import sys from mlalgs import (load_data, get_chunk_starts) import scipy.io.wavfile import numpy as np tot = [] tot_txt = [] tot_chars = 0 for f in sys.argv[2:]: text_file = 'text' + f[5:-3] + 'txt' rate, data, text = load_data(f, text_file) starts, _, _ = get_chunk_starts(data) if len(starts) != len(text): print '%s rejected: %d != %d' % (f, len(starts), len(text)) continue tot_chars += len(text) tot.append(data) tot_txt.append(text) print 'Created data file with %d characters' % tot_chars scipy.io.wavfile.write('sound%s.wav' % sys.argv[1], rate, np.concatenate(tot)) with open('text%s.txt' % sys.argv[1], "w") as f: f.write(''.join(tot_txt))
real.append(c) if minj == letters.index(c): score += 1 print ''.join(pred) print ''.join(real) return means, stds, score/float(len(test)) if __name__ == '__main__': if len(sys.argv) != 4: print 'Usage: %s training|test soundf textf' % sys.argv[0] soundf = sys.argv[2] textf = sys.argv[3] rate, data, text = load_data(soundf, textf) starts, ends, chunks = get_chunk_starts(data) f = get_features(data, starts, ends, include_fft=True, include_cepstrum=True) if sys.argv[1] == 'training': means, stds, score = naive_bayes(text, f) print 'Naive Bayes', score logreg_score, logreg = logistic_test(text, f) svm_score, svm = svm_test(text, f) joblib.dump(logreg, 'cache/logistic.pkl') print 'Logistic test', logreg_score print 'SVM test', svm_score else: try: logreg = joblib.load('cache/logistic.pkl') except: