def main(): """ First ARG: list of training files Second ARG: save name for model """ file1 = sys.argv[1] outname = sys.argv[2] file_list = [f[0:-1] for f in open(file1, 'r')] models, transitions, priors = calc_transmat(file_list) hmm = GaussianHMM( transitions.shape[0], "full", #startprob=priors, n_iter=500, transmat=transitions, init_params='mcs', params='mcs', ) feats, _ = load_feats_labels(file_list) feat, lab = load_feats_labels(file_list) #hmm.means_ = np.transpose(models['mean']) #hmm.covars_ = models['sigma'] print 'Fitting' start = timeit.default_timer() hmm.fit([np.transpose(feat)]) stop = timeit.default_timer() print 'Training Time: ' + str(stop - start) features, labels = load_feats_labels(['audio.arff']) _, seq = hmm.decode(np.transpose(features)) #print filter(lambda(x,y): x==y, zip(labels, map(int2label, seq))) print len(filter(lambda (x, y): x == y, zip(labels, map(int2label, seq)))) pickle.dump(hmm, open(outname, "wb")) plt.imshow(transitions, interpolation='nearest') plt.show()
def main(): """ First ARG: list of training files Second ARG: save name for model """ file1 = sys.argv[1] outname = sys.argv[2] file_list = [f[0:-1] for f in open(file1,'r')] models, transitions, priors = calc_transmat(file_list) hmm = GaussianHMM( transitions.shape[0], "full", #startprob=priors, n_iter=500, transmat=transitions, init_params='mcs', params='mcs', ) feats, _ = load_feats_labels(file_list) feat, lab = load_feats_labels(file_list) #hmm.means_ = np.transpose(models['mean']) #hmm.covars_ = models['sigma'] print 'Fitting' start = timeit.default_timer() hmm.fit([np.transpose(feat)]) stop = timeit.default_timer() print 'Training Time: ' + str(stop - start) features, labels = load_feats_labels(['audio.arff']) _, seq = hmm.decode(np.transpose(features)) #print filter(lambda(x,y): x==y, zip(labels, map(int2label, seq))) print len(filter(lambda(x,y): x==y, zip(labels, map(int2label, seq)))) pickle.dump(hmm, open(outname, "wb")) plt.imshow(transitions, interpolation='nearest') plt.show()
for i in range(n_states): print 'checking if initial covs are pos-definite' np.linalg.cholesky(covs[i]) print np.linalg.eigvals(covs[i]) tmat, smat = get_tmat_and_smat(pre_states, end=False, start=False) print tmat, smat model = GaussianHMM(n_components=n_states, n_iter=n_iter, covariance_type=cov_type, startprob=smat, transmat=tmat, init_params='mc') model.means_ = means model.covars_ = covs sum_inital_ll = 0.0 sum_initial_score = 0.0 sum_initial_map = 0.0 remove_idx = [] for idx, feat_from_list in enumerate(feats_as_list): if np.shape(feat_from_list)[0] > n_states: initial_ll, initial_best_seq = model.decode(feat_from_list) initial_map, initial_best_sep_map = model.decode(feat_from_list, algorithm='map') sum_initial_score += model.score(feat_from_list) sum_inital_ll += initial_ll sum_initial_map += initial_map else: remove_idx.append(idx) print 'too few samples in file', list_of_patient_file_paths[idx], np.shape(feat_from_list) print 'initial viterbi log-likelihood,', sum_inital_ll print 'initial score log-likelihood,', sum_initial_score print 'initial map log-likelihood', sum_initial_map remove_idx.sort() remove_idx.reverse() print 'removing...', remove_idx for r in remove_idx: del feats_as_list[r]
t, last_index = overlapped_samples(file_path, incident_reported_time=int(incident_time), overlap=5, window=10, with_end=2) if t is None: print file_path, 'is bad' else: model.means_ = means model.covars_ = covs print 'shape intial', np.shape(covs) ''' best_seq = model.decode(t) print 'intial,', best_seq print 'final means', model.means_ print 'initial trans', tmat print 'initial startprobs', smat, sum(smat) ''' model.fit([t]) best_seq = model.decode(t) print 'file', file_path print 'final,', best_seq #print 'final means', model.means_ #print 'final trans', model.transmat_ #print 'final startprob', model.startprob_ if np.isnan(model.means_).any() == False and np.isnan(model.covars_).any() == False: means = model.means_ covs = np.array([np.diag(model.covars_[0])]) for i in range(1, model.n_components): covs = np.vstack((covs, [np.diag(model.covars_[i])])) print 'shape after', np.shape(covs) tmat = model.transmat_
# Clamp zero variances to a tiny positive value so covariance updates
# stay well-conditioned.
covars[covars == 0] = 1e-5

# init_params omits 'm' and 'c': means/covars below are kept, everything
# else is (re)initialized by fit().
model = GaussianHMM(numState, covariance_type="tied", n_iter=1000,
                    init_params='abdefghijklnopqrstuvwxyzABDEFGHIJKLNOPQRSTUVWXYZ')
model.means_ = means
model.covars_ = covars

print("Fitting model...")
sys.stdout.flush()
model.fit(data)

print("Decoding states...")
sys.stdout.flush()

# Decode every sequence, concatenating state paths into one long array
# and summing the log-likelihoods (direct iteration, no index loop).
states = np.array([])
score = 0
for seq in data:
    hidden_states = model.decode(seq)
    states = np.append(states, hidden_states[1])
    score += model.score(seq)

print("Saving data...")
sys.stdout.flush()

# Hoist the shared output directory; it was rebuilt three times before.
outdir = "data/substates/%s%d/%d" % (basepath, stateNum, numState)
np.savetxt("%s/rep_%d_states.txt" % (outdir, repInx), states, fmt="%d")
with open("%s/rep_%d_LLH.txt" % (outdir, repInx), 'w') as f:
    f.write(str(score))
saveobject(model, "%s/rep_%d.pk" % (outdir, repInx))
means = np.array([[0.0, 0.0], [np.log1p(args.coverage), 0.0], [0.0, np.log1p(args.coverage)], [np.log1p(args.coverage / 2), np.log1p(args.coverage / 2)], [np.log1p(args.coverage), np.log1p(args.coverage)]]) cv = 1.0 covars = np.array([[0.01, 0.01], [cv, 0.01], [0.01, cv], [cv / 2, cv / 2], [cv, cv]]) hidden = ["private"] + ref_samples + ["heterozygous", "pseudohet"] hmm = GaussianHMM(n_components=len(means), random_state=rs) hmm._set_means(means) hmm._set_covars(covars) ## filter sites; compute observation sequence as log(1+count) keep = np.logical_and((counts.max(1) < args.X_max * args.coverage), (counts.sum(1) > -1.0)) counts = counts[keep, :] obs = np.log1p(counts) starts = np.array([start for start, end in ivls]).reshape((len(ivls), 1)) starts = starts[keep, :] ## run hmm states = hmm.decode(obs) ## print result to stdout for i in range(0, counts.shape[0]): print starts[i, 0], obs[i, 0], obs[i, 1], hidden[states[1][i]]
ax.set_xticks(range(start,end+1),minor=True) ax.legend() ax.grid(True,which='both') plt.show() ############################################################################## # Run HMM X_hmm = np.column_stack((y_train,X_train[['hour_of_day','weather','day_of_week']])) #X_hmm = np.column_stack((y_train,X_train[['hour_of_day','weather']])) #X_hmm = y_train from sklearn.hmm import GaussianHMM n_clusters = 9 #n_clusters = 17 model = GaussianHMM(n_clusters,covariance_type='diag',n_iter=1000) model.fit([X_hmm]) hidden_states = model.predict(X_hmm) viterbi_states = model.decode(X_hmm) x_ax = np.asarray(range(len(X_hmm))) x_ax = X_train['hour_of_day'] + X_train['day_of_week']*24 #x_ax = X_train['hour_of_day'] x_ax = np.asarray([item.to_datetime() for item in X_train.index]) def plot_HMM(n_clusters,hidden_states,x_ax,y_ax): #PLOT HIDDEN STATES fig = plt.figure() ax = fig.add_subplot(111) for i in xrange(n_clusters): print i idx = (hidden_states==i) if i<7: ax.plot(x_ax[idx],y_ax[idx],'o',label='%dth state'%i) elif i<14: ax.plot(x_ax[idx],y_ax[idx],'x',label='%dth state'%i)
means = np.array([ [ 0.0, 0.0 ], [ np.log1p(args.coverage), 0.0 ], [ 0.0, np.log1p(args.coverage) ], [ np.log1p(args.coverage/2), np.log1p(args.coverage/2) ], [ np.log1p(args.coverage), np.log1p(args.coverage) ] ]) cv = 1.0 covars = np.array([ [ 0.01, 0.01 ], [ cv, 0.01 ], [ 0.01, cv ], [ cv/2, cv/2 ], [ cv, cv ] ]) hidden = [ "private" ] + ref_samples + [ "heterozygous","pseudohet" ] hmm = GaussianHMM(n_components = len(means), random_state = rs) hmm._set_means(means) hmm._set_covars(covars) ## filter sites; compute observation sequence as log(1+count) keep = np.logical_and((counts.max(1) < args.X_max*args.coverage), (counts.sum(1) > -1.0)) counts = counts[ keep,: ] obs = np.log1p(counts) starts = np.array([ start for start,end in ivls ]).reshape( (len(ivls), 1) ) starts = starts[ keep,: ] ## run hmm states = hmm.decode(obs) ## print result to stdout for i in range(0, counts.shape[0]): print starts[i,0], obs[i,0], obs[i,1], hidden[ states[1][i] ]