import os
import cPickle

import sklearn.grid_search
import sklearn.cross_validation

import hmm_chain
# create_dataset_crossvalidation is a project-local helper defined elsewhere.


def train_and_save(parameters, dataset, filename):
    """Creates an HMMChain, loads its parameters, trains it on dataset and
    saves the result in filename.
    """
    chain = hmm_chain.HMMChain()
    chain.set_params(**parameters)
    chain.my_class = None
    chain.other_classes = None

    dataset, _, _ = create_dataset_crossvalidation(dataset)
    chain.fit(dataset)
    score = chain.score(dataset)

    display_name = os.path.split(filename)[1].split(".")[0]
    longline = "========%s========" % display_name
    print
    print longline
    print "After training the score is ", score
    print "with parameters: %s" % parameters
    print "=" * len(longline)
    print

    # Binary mode, since HIGHEST_PROTOCOL is a binary pickle format.
    with open(filename, "wb") as f:
        print "Saving file: ", filename
        cPickle.dump(chain, f, protocol=cPickle.HIGHEST_PROTOCOL)
def train_and_save(parameters, dataset, filename):
    """Variant of train_and_save that trains directly on the dataset as
    given, without re-creating the cross-validation split.
    """
    chain = hmm_chain.HMMChain()
    chain.set_params(**parameters)
    chain.my_class = None
    chain.other_classes = None

    chain.fit(dataset)
    score = chain.score(dataset)
    print "After training the score is ", score

    with open(filename, "wb") as f:
        print "Saving file: ", filename
        cPickle.dump(chain, f, protocol=cPickle.HIGHEST_PROTOCOL)
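# A minimal usage sketch for train_and_save. The dataset path, parameter
# values and output filename below are hypothetical, not from this project.
def _example_train_and_save():
    dataset = cPickle.load(open("/tmp/example_dataset.pkl"))
    params = dict(n_pca_components=0.97,
                  n_hidden_components=15,
                  resampling_size=25,
                  n_discretization_symbols=10,
                  hmm_max_iter=100)
    train_and_save(params, dataset, "/tmp/example_chain.pkl")

    # The mirror operation: reload the pickled chain and re-score it.
    with open("/tmp/example_chain.pkl", "rb") as f:
        chain = cPickle.load(f)
    print "Reloaded score: ", chain.score(dataset)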
def train_dataset(dataset):
    """Uses cross-validation to train an HMM chain. Returns the parameters
    that yield the best result.

    CHANGE THE PARAMETERS HERE TO USE A SUITABLE RANGE, THESE ARE ONLY
    HERE FOR TESTING!
    """
    dataset, train_indexes, test_indexes = create_dataset_crossvalidation(dataset)
    cv = [(train_indexes, test_indexes)]

    parameters = [
        dict(n_pca_components=[0.97],
             n_hidden_components=[12, 15, 18],
             resampling_size=[20, 25, 30],
             n_discretization_symbols=[5, 10, 12],
             hmm_max_iter=[100],
             #kmeans_max_iter=[1000],
             ),
        #dict(n_pca_components=[0.97],
        #     n_hidden_components=[40, 50],
        #     resampling_size=[20],
        #     n_discretization_symbols=[30],
        #     hmm_max_iter=[2000],
        #     #kmeans_max_iter=[1000],
        #     ),
    ]
    #parameters = [
    #    dict(n_pca_components=[0.97],
    #         n_hidden_components=[15, 18, 25],
    #         resampling_size=[30, 40, 50],
    #         n_discretization_symbols=[10, 12, 15],
    #         hmm_max_iter=[300],
    #         ),
    #]
    #print "Using parameters:\n", parameters

    chain = hmm_chain.HMMChain()
    grid = sklearn.grid_search.GridSearchCV(chain, parameters,
                                            cv=cv,
                                            verbose=10,
                                            n_jobs=6,
                                            refit=False)
    grid.fit(dataset)
    return grid.best_params_
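# create_dataset_crossvalidation is defined elsewhere in the project. From
# its call sites above, it is assumed to return the (possibly re-stacked)
# dataset together with index arrays for a single train/test split. A purely
# hypothetical sketch of that contract, kept as a comment so it does not
# shadow the real helper:
#
#   def create_dataset_crossvalidation(dataset, test_fraction=0.25):
#       indexes = numpy.random.permutation(len(dataset))
#       n_test = int(len(dataset) * test_fraction)
#       return dataset, indexes[n_test:], indexes[:n_test]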
def train_dataset(dataset, all_adjectives, adjective):
    """Variant of train_dataset for adjective classification: grid-searches
    the HMM chain with a ShuffleSplit cross-validator and returns the
    parameters that yield the best result.
    """
    parameters = [
        dict(n_pca_components=[0.97],
             n_hidden_components=[35, 40, 45],
             resampling_size=[20],
             n_discretization_symbols=[25],
             hmm_max_iter=[2000],
             #kmeans_max_iter=[1000],
             ),
        #dict(n_pca_components=[0.97],
        #     n_hidden_components=[40, 50],
        #     resampling_size=[20],
        #     n_discretization_symbols=[30],
        #     hmm_max_iter=[2000],
        #     #kmeans_max_iter=[1000],
        #     ),
    ]
    print "Using parameters:\n", parameters

    chain = hmm_chain.HMMChain()
    cross_validator = sklearn.cross_validation.ShuffleSplit(len(dataset),
                                                            n_iterations=2,
                                                            train_size=3. / 4.)
    # Inject the class labels into every entry of the parameter grid.
    for p in parameters:
        p.update(my_class=[adjective], other_classes=[all_adjectives])

    grid = sklearn.grid_search.GridSearchCV(chain, parameters,
                                            cv=cross_validator,
                                            verbose=10,
                                            n_jobs=6,
                                            refit=False)
    grid.fit(dataset)
    return grid.best_params_
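# End-to-end sketch tying the grid search to the saving step. The driver
# below is hypothetical; the real project may wire these steps differently.
def _example_train_adjective(adjective, all_adjectives, dataset, filename):
    best_params = train_dataset(dataset, all_adjectives, adjective)
    train_and_save(best_params, dataset, filename)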
from pylab import *
import utilities
import hmm_chain
import cPickle

bumpy = cPickle.load(
    open("/home/pezzotto/log/bigbags/bag_files/databases/bumpy.pkl"))
pdc = bumpy['SLIDE_5CM']['pdc']
splits = [len(d) for d in pdc]

hmm = hmm_chain.HMMChain(data_splits=splits,
                         n_pca_components=1,
                         resampling_size=50,
                         n_discretization_symbols=5)
hmm.update_splits(pdc)

# Run the chain stage by stage: combine -> PCA -> split -> resample ->
# combine -> discretize -> split -> HMM fit.
pca = hmm.pca
pca.fit(vstack(pdc))
Xt = hmm.splitter.transform(pca.transform(hmm.combiner.transform(pdc)))
Xt = hmm.resample.fit_transform(Xt)
Xt = hmm.combiner.transform(Xt)
hmm.discretizer.fit(Xt)
Xt = hmm.discretizer.transform(Xt)
Xt = hmm.splitter2.transform(Xt)
hmm.hmm.fit(Xt)
print "Score: ", hmm.score(pdc)

# Now the same data, letting the chain run the whole pipeline itself, to
# compare against the staged run above.
print "Using the whole training"
pdc = bumpy['SLIDE_5CM']['pdc']
hmm.fit(pdc)
print "Score: ", hmm.score(pdc)