def train_and_test(self, dataset, batch_size):
    self.batch_size = batch_size
    x_train, y_train = dataset.get_training_data()
    num_examples = dataset.get_num_training_examples()
    dropped = 0
    for idx in range(num_examples):
        if idx % 100 == 0:
            print('Training on {} out of {} examples'.format(
                idx, num_examples))
        mfcc_vec_seq = x_train[idx]
        phoneme_idx = y_train[idx]
        # drop sequences that are too short to cover all HMM states
        if len(mfcc_vec_seq) < self.n_states:
            print('dropped')
            dropped += 1
        else:
            # find the appropriate HMM and fit it using EM
            hmm = self.hmms[phoneme_idx]
            # count this phoneme's occurrence
            if phoneme_idx not in self.phonem_freq:
                self.phonem_freq[phoneme_idx] = 0
            self.phonem_freq[phoneme_idx] += 1
            hmm.fit(mfcc_vec_seq)
    # normalize phoneme counts into frequencies
    phonem_num = sum(self.phonem_freq.values())
    for key in self.phonem_freq:
        self.phonem_freq[key] /= float(phonem_num)
        print(self.phonem_freq[key])
        print(key)
    print('Done training, dropped {} out of {}'.format(
        dropped, num_examples))
    self.test_on_random_training_batch(dataset, self.batch_size)
    self.test(dataset)
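# A minimal sketch of the per-phoneme setup that train_and_test above assumes:
# self.hmms maps each phoneme index to its own model. The class name and the
# use of hmmlearn's GaussianHMM are assumptions -- the snippet above only
# shows that each model exposes a fit() method.
import hmmlearn.hmm

class PhonemeRecognizer:  # hypothetical owner of train_and_test
    def __init__(self, n_phonemes, n_states=3):
        self.n_states = n_states
        self.phonem_freq = {}
        # one left-to-right-style Gaussian HMM per phoneme index
        self.hmms = {
            p: hmmlearn.hmm.GaussianHMM(n_components=n_states, n_iter=10)
            for p in range(n_phonemes)
        }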
def train(dataset):
    # Get all vectors in the dataset
    all_vectors = np.concatenate(
        [np.concatenate(v, axis=0) for k, v in dataset.items()], axis=0)
    print("vectors", all_vectors.shape)
    # Run the K-Means algorithm to get clusters
    kmeans = clustering(all_vectors)
    print("centers", kmeans.cluster_centers_.shape)
    models = {}
    for cname in CLASS_NAMES:
        # Convert all vectors to cluster indices:
        # dataset['one'] = [O^1, ..., O^R], where each observation
        # sequence O^r = (c1, c2, ..., ct, ..., cT) has size T x 1
        dataset[cname] = list(
            [kmeans.predict(v).reshape(-1, 1) for v in dataset[cname]])
        # define the model
        hmm = hmm_model()
        if 'test' not in cname:
            X = np.concatenate(dataset[cname])
            lengths = list([len(x) for x in dataset[cname]])
            print("training class", cname)
            print(X.shape, lengths, len(lengths))
            hmm.fit(X, lengths=lengths)
            models[cname] = hmm
    print("Training done")
    return models
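# The clustering() helper used above is not shown. A plausible minimal
# implementation (an assumption, not the original) is a thin wrapper around
# scikit-learn's KMeans, which provides the cluster_centers_ attribute and
# predict() method the snippet relies on.
from sklearn.cluster import KMeans

def clustering(vectors, n_clusters=20):
    # Fit K-Means on the pooled feature vectors; each cluster index later
    # serves as a discrete observation symbol for the MultinomialHMM.
    return KMeans(n_clusters=n_clusters, random_state=0).fit(vectors)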
def train_hmm_and_keep_track_of_log_likelihood(hmm, obs, n_iter=1, **kwargs):
    # Run EM one iteration at a time so the log-likelihood can be
    # recorded after each step.
    hmm.n_iter = 1
    hmm.fit(obs)
    loglikelihoods = []
    for n in range(n_iter):
        hmm.n_iter = 1
        hmm.init_params = ''  # keep the parameters from the previous step
        hmm.fit(obs)
        loglikelihoods.append(sum(hmm.score(x) for x in obs))
    return loglikelihoods
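# The function above targets an older hmmlearn/sklearn API where fit()
# accepted a list of sequences and score() took one sequence at a time. A
# rough equivalent for modern hmmlearn (>= 0.2), assuming obs is a list of
# (T, d) arrays -- the toy data and iteration count here are illustrative:
import numpy as np
from hmmlearn.hmm import GaussianHMM

rng = np.random.RandomState(0)
obs = [rng.randn(100, 2) for _ in range(3)]  # toy observation sequences
X = np.concatenate(obs)
lengths = [len(x) for x in obs]

model = GaussianHMM(n_components=3, n_iter=1)
model.fit(X, lengths)                 # one EM iteration with fresh init
loglikelihoods = []
for _ in range(10):
    model.init_params = ''            # keep parameters between calls
    model.fit(X, lengths)             # one more EM iteration
    loglikelihoods.append(model.score(X, lengths))
print(loglikelihoods)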
def test_ala2():
    # Creates a 4-state HMM on the ALA2 data. Nothing fancy, just makes
    # sure the code runs without erroring out.
    trajectories = AlanineDipeptide().get_cached().trajectories
    topology = trajectories[0].topology
    indices = topology.select('symbol C or symbol O or symbol N')
    featurizer = SuperposeFeaturizer(indices, trajectories[0][0])
    sequences = featurizer.transform(trajectories)

    hmm = GaussianHMM(n_states=4, n_init=3)
    hmm.fit(sequences)

    assert len(hmm.timescales_) == 3
    assert np.any(hmm.timescales_ > 50)
def test_ala2():
    # Creates a 4-state HMM on the ALA2 data. Nothing fancy, just makes
    # sure the code runs without erroring out.
    trajectories = AlanineDipeptide().get_cached().trajectories
    topology = trajectories[0].topology
    indices = topology.select('symbol C or symbol O or symbol N')
    featurizer = SuperposeFeaturizer(indices, trajectories[0][0])
    sequences = featurizer.transform(trajectories)

    hmm = GaussianHMM(n_states=4, n_init=3, random_state=rs)
    hmm.fit(sequences)

    assert len(hmm.timescales_) == 3
    assert np.any(hmm.timescales_ > 50)
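# The rs referenced in this and the following test is a module-level random
# state defined elsewhere in the test suite; presumably something like the
# following (the seed is an assumption):
import numpy as np
rs = np.random.RandomState(42)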
def test_pickle():
    """Test pickling an HMM"""
    trajectories = AlanineDipeptide().get_cached().trajectories
    topology = trajectories[0].topology
    indices = topology.select('symbol C or symbol O or symbol N')
    featurizer = SuperposeFeaturizer(indices, trajectories[0][0])
    sequences = featurizer.transform(trajectories)

    hmm = GaussianHMM(n_states=4, n_init=3, random_state=rs)
    hmm.fit(sequences)
    logprob, hidden = hmm.predict(sequences)

    # Round-trip the model through pickle and check that the reloaded
    # model reproduces the same log-probabilities.
    with tempfile.TemporaryFile() as savefile:
        pickle.dump(hmm, savefile)
        savefile.seek(0, 0)
        hmm2 = pickle.load(savefile)
        logprob2, hidden2 = hmm2.predict(sequences)
        assert logprob == logprob2
def train(dataset):
    # Get all vectors in the dataset
    all_vectors = np.concatenate(
        [np.concatenate(v, axis=0) for k, v in dataset.items()], axis=0)
    print("vectors", all_vectors.shape)
    # Run the K-Means algorithm to get clusters
    kmeans = clustering(all_vectors)
    print("centers", kmeans.cluster_centers_.shape)
    models = {}
    for cname in CLASS_NAMES:
        # Convert all vectors to cluster indices:
        # dataset['one'] = [O^1, ..., O^R], where each observation
        # sequence O^r = (c1, c2, ..., ct, ..., cT) has size T x 1
        dataset[cname] = list(
            [kmeans.predict(v).reshape(-1, 1) for v in dataset[cname]])
        # Pick the per-class model configuration
        if cname == "benh_nhan":
            hmm = hmm_model(N_COMPONENT_BN, START_PROB_BN, TRANSMAT_PRIOR_BN)
        elif cname == "cua":
            hmm = hmm_model(N_COMPONENT_CUA, START_PROB_CUA, TRANSMAT_PRIOR_CUA)
        elif cname == "khong":
            hmm = hmm_model(N_COMPONENT_KHONG, START_PROB_KHONG, TRANSMAT_PRIOR_KHONG)
        elif cname == "nguoi":
            hmm = hmm_model(N_COMPONENT_NGUOI, START_PROB_NGUOI, TRANSMAT_PRIOR_NGUOI)
        if 'test' not in cname:
            X = np.concatenate(dataset[cname])
            lengths = list([len(x) for x in dataset[cname]])
            print("training class", cname)
            print(X.shape, lengths, len(lengths))
            hmm.fit(X, lengths=lengths)
            models[cname] = hmm
    print("Training done")
    return models
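# The hmm_model() factory above is not shown. A plausible sketch, assuming
# it wraps hmmlearn's MultinomialHMM the way other snippets in this section
# do; the per-class constants such as N_COMPONENT_BN are defined elsewhere,
# and the random_state/n_iter values here are assumptions:
import hmmlearn.hmm

def hmm_model(n_components, startprob_prior, transmat_prior):
    return hmmlearn.hmm.MultinomialHMM(
        n_components=n_components,
        startprob_prior=startprob_prior,
        transmat_prior=transmat_prior,
        random_state=0,
        n_iter=1000,
    )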
def train(data_train, state_num):
    models = {}
    dataset = data_train.copy()
    for cname in dataset.keys():
        n = state_num[cname]
        # Force the model to start in state 0
        startprob = np.zeros(n)
        startprob[0] = 1
        # Identity matrix as the transition prior
        transmat = np.diag(np.full(n, 1))
        hmm = hmmlearn.hmm.MultinomialHMM(
            n_components=n,
            random_state=0,
            n_iter=1000,
            verbose=False,
            startprob_prior=startprob,
            transmat_prior=transmat,
        )
        X = np.concatenate(dataset[cname])
        lengths = list([len(x) for x in dataset[cname]])
        hmm.fit(X, lengths=lengths)
        models[cname] = hmm
    return models
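# Example of classifying one observation sequence O (a T x 1 array of
# cluster indices) with the per-class models returned by train(); this
# mirrors the scoring loops used in the testing code later in this section.
# The classify name itself is illustrative.
def classify(models, O):
    scores = {cname: model.score(O, [len(O)])
              for cname, model in models.items()}
    # The predicted word is the class whose HMM assigns the highest
    # log-likelihood to the sequence.
    return max(scores, key=scores.get)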
        params='mctw', init_params='mst')
    # Left-to-right topology with a self-loop on the final state
    hmm.startprob_ = np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
    hmm.transmat_ = np.array([
        [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    ])
    X = np.concatenate(dataset_train[cname])
    lengths = list([len(x) for x in dataset_train[cname]])
    hmm.fit(X, lengths=lengths)
    models[cname] = hmm

print("Training done")
with open("gmm_hmm.pkl", "wb") as file:
    pickle.dump(models, file)
print("Saved!")

print("Testing")
for true_cname in class_names:
    true_predict = 0
    for O in dataset_test[true_cname]:
        0.1, 0.1, 0.5,
    ], [
        0.1, 0.1, 0.1, 0.1, 0.1, 0.5,
    ],
]),
)
if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X, lengths=lengths)
    models[cname] = hmm

print("Training done")

print("Testing")
for true_cname in class_names:
    for O in dataset[true_cname]:
        score = {
            cname: model.score(O, [len(O)])
            for cname, model in models.items() if cname[:4] != 'test'
        }
        print(true_cname, score)
def train(evaluate=False):
    '''
    If evaluate is True, test data will be chosen randomly from the
    training data set; however, you need to back up your self-recorded
    test data before using this evaluation mode.
    '''
    global trained
    trained = True
    models = {}

    class1 = ["toi", "test_toi"]
    class2 = ["mot", "test_mot"]
    class3 = ["trong", "test_trong"]
    class4 = ["thoigian", "test_thoigian"]
    class5 = ["chungta", "test_chungta"]
    class_names = []
    class_names.extend(class1)
    class_names.extend(class2)
    class_names.extend(class3)
    class_names.extend(class4)
    class_names.extend(class5)

    dataset = {}
    if evaluate is True:
        # Sample 10 recordings from each training class into its test folder
        for cname in class_names:
            if cname[:4] == "test":
                cname_ = cname.split("_")[1]
                data_dir_src = os.path.join("data", cname_)
                data_dir_dst = os.path.join("data", cname)
                samples = random.sample(
                    [x for x in os.listdir(data_dir_src)
                     if os.path.isfile(os.path.join(data_dir_src, x))], 10)
                for x in samples:
                    copyfile(os.path.join(data_dir_src, x),
                             os.path.join(data_dir_dst, x))

    for cname in class_names:
        print(f"Load {cname} dataset")
        dataset[cname] = get_class_data(os.path.join("data", cname))

    all_vectors = np.concatenate(
        [np.concatenate(v, axis=0) for k, v in dataset.items()], axis=0)
    kmeans = clustering(all_vectors)
    kmeans_model_filename = 'kmeans.joblib'
    with open(kmeans_model_filename, 'wb') as f_kmeans:
        joblib.dump(kmeans, f_kmeans)
    print("centers", kmeans.cluster_centers_.shape)

    config = {
        'toi': {'n_components': 5},
        'mot': {'n_components': 5},
        'trong': {'n_components': 5},
        'thoigian': {'n_components': 8},  # 10
        'chungta': {'n_components': 8},
        'test_toi': {'n_components': 5},
        'test_mot': {'n_components': 5},
        'test_trong': {'n_components': 5},
        'test_thoigian': {'n_components': 8},  # 10
        'test_chungta': {'n_components': 8},
        'demo': {'n_components': 3},
    }

    for cname in class_names:
        dataset[cname] = list(
            [kmeans.predict(v).reshape(-1, 1) for v in dataset[cname]])
        n_components = config[cname]['n_components']
        # Left-to-right topology: start in state 0, advance with p=0.3,
        # absorb in the final state
        start_prob = np.zeros(n_components)
        start_prob[0] = 1.0
        transmat = np.zeros(shape=(n_components, n_components), dtype=float)
        for i in range(n_components - 1):
            transmat[i][i] = 0.7
            transmat[i][i + 1] = 0.3
        transmat[n_components - 1][n_components - 1] = 1.0

        hmm = hmmlearn.hmm.MultinomialHMM(
            n_components=n_components,
            random_state=0,
            n_iter=1000,
            verbose=True,
            transmat_prior=transmat,
            startprob_prior=start_prob,
            init_params='ste',
            params='ste'
        )
        if cname[:4] != 'test':
            X = np.concatenate(dataset[cname])
            lengths = list([len(x) for x in dataset[cname]])
            print("training class", cname)
            hmm.fit(X, lengths=lengths)
            models[cname] = hmm

    print("Training done")
    model_filename = 'finalized_model.joblib'
    with open(model_filename, 'wb') as f_hmm:
        joblib.dump(models, f_hmm)

    if evaluate is True:
        print("Testing")
        for true_cname in class_names:
            if true_cname != 'demo':
                count = 0
                for O in dataset[true_cname]:
                    score = {cname: model.score(O, [len(O)])
                             for cname, model in models.items()
                             if cname[:4] != 'test' and cname != 'demo'}
                    predict = max(score, key=score.get)
                    if predict == true_cname or (true_cname[:4] == 'test'
                            and predict == true_cname.split('_')[1]):
                        count += 1
                print(f"true: {count}/{len(dataset[true_cname])}")
        # Clean up the sampled evaluation data
        for cname in class_names:
            if cname[:4] == "test":
                data_dir_dst = os.path.join("data", cname)
                for x in os.listdir(data_dir_dst):
                    os.remove(os.path.join(data_dir_dst, x))

    return models
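# get_class_data() is not shown in this section. A plausible sketch (an
# assumption -- the file filter and MFCC settings are illustrative) reads
# every wav file in a class directory and returns a list of per-file MFCC
# matrices, matching the (T, d) arrays that kmeans.predict() expects above:
import os
import librosa

def get_class_data(data_dir):
    features = []
    for fname in sorted(os.listdir(data_dir)):
        if fname.endswith('.wav'):
            y, sr = librosa.load(os.path.join(data_dir, fname))
            # transpose to shape (T, n_mfcc): one MFCC vector per frame
            features.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T)
    return features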
def main():
    n_states = {
        'khong': 11,
        'vietnam': 24,
        'nguoi': 11,
        'benhvien': 24,
        'trong': 11
    }

    # Load dataset
    class_names = [
        f for f in os.listdir('data')
        if os.path.isdir(os.path.join('data', f))
    ]
    dataset = {}
    for cname in class_names:
        print(f"-->Load {cname} dataset")
        dataset[cname] = get_class_data(cname)

    # Get all vectors in the datasets
    all_vectors = np.concatenate(
        [np.concatenate(v, axis=0) for k, v in dataset.items()], axis=0)
    print("vectors", all_vectors.shape)

    # Run K-Means algorithm to get clusters
    kmeans = clustering(all_vectors, n_clusters=35)

    # Train models
    models = {}
    class_vectors = dataset.copy()
    for cname in class_names:
        # Convert all vectors to cluster indices:
        # dataset['one'] = [O^1, ..., O^R], where each
        # O^r = (c1, c2, ..., ct, ..., cT) has size T x 1
        class_vectors[cname] = list(
            [kmeans.predict(v).reshape(-1, 1) for v in class_vectors[cname]])
        if cname[:4] != 'test':
            hmm = hmmlearn.hmm.MultinomialHMM(
                n_components=n_states[cname],
                random_state=2020,
                n_iter=1000,
                verbose=False,
                init_params='e',
                params='te',
            )
            hmm.startprob_ = np.array(make_pi(n_states[cname]))
            hmm.transmat_ = np.array(make_A(n_states[cname]))

            X = np.concatenate(class_vectors[cname])
            lengths = list([len(x) for x in class_vectors[cname]])
            print("training class", cname)
            print(X.shape, lengths, len(lengths))
            hmm.fit(X, lengths=lengths)
            models[cname] = hmm
    print("<--Training done-->\n")

    print("-----Testing-----")
    print("Test in Datatrain")
    for test_cname in class_names:
        cnt = 0
        if test_cname[:4] != "test":
            for O in class_vectors[test_cname]:
                score = {
                    cname: model.score(O, [len(O)])
                    for cname, model in models.items() if cname[:4] != 'test'
                }
                max_value = max(v for k, v in score.items())
                for k, v in score.items():
                    if v == max_value and k == test_cname:
                        cnt += 1
            print(f"{test_cname} -- Score: ",
                  cnt / len(class_vectors[test_cname]))
    print()

    print("Test in Datatest")
    for test_cname in class_names:
        cnt = 0
        if test_cname[:4] == "test":
            for O in class_vectors[test_cname]:
                score = {
                    cname: model.score(O, [len(O)])
                    for cname, model in models.items() if cname[:4] != 'test'
                }
                max_value = max(v for k, v in score.items())
                for k, v in score.items():
                    if v == max_value:
                        predict = k
                if predict.strip() == test_cname[5:].strip():
                    cnt += 1
            print(f"{test_cname} -- Score: ",
                  cnt / len(class_vectors[test_cname]))

    # Extract model parameters
    with open("Models_parameters.txt", "w") as f:
        for cname, model in models.items():
            f.write(f"Model_name : {cname}\n")
            f.write("Startprob matrix:\n")
            f.write(" ".join(map(str, model.startprob_)))
            f.write("\nTransition Matrix\n")
            f.write(" ".join(map(str, model.transmat_)))
            f.write("\nEmissionProb Matrix\n")
            f.write(" ".join(map(str, model.emissionprob_)))
            f.write("\n\n")
    print("Extracted models to Models_parameters.txt successfully")

    # Save models
    if "models" not in os.listdir():
        os.mkdir("models")
    # K-Means
    with open(os.path.join("models", "kmeans.pkl"), "wb") as f:
        pickle.dump(kmeans, f)
    print("Saved Kmeans model to 'models/kmeans.pkl' successfully")
    # HMMs
    with open(os.path.join("models", "models.pkl"), "wb") as f:
        pickle.dump(models, f)
    print("Saved HMMs model to 'models/models.pkl' successfully")
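# A minimal inference sketch using the artifacts main() saves: load the
# K-Means codebook and the per-class HMMs, quantize a new MFCC sequence,
# and pick the class with the highest log-likelihood. The predict_word and
# mfcc_sequence names are illustrative, not from the original.
import os
import pickle

def predict_word(mfcc_sequence):
    with open(os.path.join("models", "kmeans.pkl"), "rb") as f:
        kmeans = pickle.load(f)
    with open(os.path.join("models", "models.pkl"), "rb") as f:
        models = pickle.load(f)
    # Quantize the (T, d) MFCC matrix into a T x 1 sequence of symbols
    O = kmeans.predict(mfcc_sequence).reshape(-1, 1)
    scores = {cname: model.score(O, [len(O)])
              for cname, model in models.items()}
    return max(scores, key=scores.get)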