Example #1
0
def train_and_save(parameters, dataset, filename):
    """Creates a HMMChain and, loads its parameters, trains it on dataset and
    saves the results in filename.    
    """
    
    chain = hmm_chain.HMMChain()
    chain.set_params(**parameters)
    chain.my_class = None
    chain.other_classes = None
    
    dataset, _, _ = create_dataset_crossvalidation(dataset)    
    chain.fit(dataset)

    score = chain.score(dataset)
    
    display_name = os.path.split(filename)[1].split(".")[0]
    longline = "========%s========" % display_name
    print
    print longline
    print "After training the score is ", score
    print "with parameters: %s" % parameters
    print "=" * len(longline)
    print
    
    with open(filename, "w") as f:
        print "Saving file: ", filename
        cPickle.dump(chain, f, protocol=cPickle.HIGHEST_PROTOCOL)
Example #2
0
def train_and_save(parameters, dataset, filename):
    chain = hmm_chain.HMMChain()
    chain.set_params(**parameters)
    chain.my_class = None
    chain.other_classes = None

    chain.fit(dataset)

    score = chain.score(dataset)
    print "After training the score is ", score

    with open(filename, "w") as f:
        print "Saving file: ", filename
        cPickle.dump(chain, f, protocol=cPickle.HIGHEST_PROTOCOL)
Example #3
0
def train_dataset(dataset):
    """Run a single-fold cross-validated grid search over HMM chain
    parameters and return the best parameter combination found.

    NOTE: the parameter ranges below are small values intended for
    testing; widen them for real training runs.
    """
    dataset, train_indexes, test_indexes = create_dataset_crossvalidation(dataset)
    folds = [(train_indexes, test_indexes)]

    param_grid = [
        dict(
            n_pca_components=[0.97],
            n_hidden_components=[12, 15, 18],
            resampling_size=[20, 25, 30],
            n_discretization_symbols=[5, 10, 12],
            hmm_max_iter=[100],
        ),
    ]

    searcher = sklearn.grid_search.GridSearchCV(hmm_chain.HMMChain(),
                                                param_grid,
                                                cv=folds,
                                                verbose=10,
                                                n_jobs=6,
                                                refit=False)
    searcher.fit(dataset)

    return searcher.best_params_
Example #4
0
def train_dataset(dataset, all_adjectives, adjective):

    parameters = [
        dict(
            n_pca_components=[0.97],
            n_hidden_components=[35, 40, 45],
            resampling_size=[20],
            n_discretization_symbols=[
                25,
            ],
            hmm_max_iter=[2000],
            #kmeans_max_iter = [1000]
        ),
        #dict(n_pca_components = [0.97],
        #n_hidden_components=[40, 50],
        #resampling_size=[20],
        #n_discretization_symbols=[30, ],
        #hmm_max_iter = [2000],
        ##kmeans_max_iter = [1000]
        #),
    ]

    print "Using parameters:\n", parameters

    chain = hmm_chain.HMMChain()
    cross_validator = sklearn.cross_validation.ShuffleSplit(len(dataset),
                                                            n_iterations=2,
                                                            train_size=3. / 4.)

    for p in parameters:
        p.update(my_class=[adjective], other_classes=[all_adjectives])
    grid = sklearn.grid_search.GridSearchCV(chain,
                                            parameters,
                                            cv=cross_validator,
                                            verbose=10,
                                            n_jobs=6,
                                            refit=False)
    grid.fit(dataset)

    return grid.best_params_
Example #5
0
from pylab import *
import utilities

import hmm_chain
import cPickle
bumpy = cPickle.load(
    open("/home/pezzotto/log/bigbags/bag_files/databases/bumpy.pkl"))
pdc = bumpy['SLIDE_5CM']['pdc']
splits = [len(d) for d in pdc]
hmm = hmm_chain.HMMChain(data_splits=splits,
                         n_pca_components=1,
                         resampling_size=50,
                         n_discretization_symbols=5)
hmm.update_splits(pdc)
pca = hmm.pca
pca.fit(vstack(pdc))
Xt = hmm.splitter.transform(pca.transform(hmm.combiner.transform(pdc)))
Xt = hmm.resample.fit_transform(Xt)
Xt = hmm.combiner.transform(Xt)
hmm.discretizer.fit(Xt)
Xt = hmm.discretizer.transform(Xt)
Xt = hmm.splitter2.transform(Xt)
hmm.hmm.fit(Xt)

print "Score: ", hmm.score(pdc)

print "Using the whole training"
pdc = bumpy['SLIDE_5CM']['pdc']
hmm.fit(pdc)
print "Score: ", hmm.score(pdc)