def get_classifier(args):
    """Build an ``HMMClassifier`` around the model selected by ``args.model``.

    Supported values for ``args.model``: ``'hmm'``, ``'fhmm-exact'``,
    ``'fhmm-seq'``.

    Args:
        args: parsed CLI arguments; reads ``model``, ``n_chains``,
            ``n_training_iter``, ``n_states``, ``topology``, ``verbose``,
            ``transition_init``, ``emission_init``, ``covar_type``, ``n_jobs``.

    Returns:
        An ``HMMClassifier`` wrapping the configured model.

    Raises:
        ValueError: if ``args.model`` names an unknown model type.
    """
    if args.model == 'hmm':
        model = GaussianHMM()
    elif args.model == 'fhmm-exact':
        model = ExactGaussianFHMM(n_chains=args.n_chains)
    elif args.model == 'fhmm-seq':
        model = SequentialGaussianFHMM(n_chains=args.n_chains)
    else:
        # Raise explicitly instead of `assert model is not None`: asserts are
        # stripped under `python -O`, which would turn a bad --model into an
        # AttributeError on `None` further down.
        raise ValueError('unknown model type "%s"' % args.model)
    # Hyper-parameters shared by every model variant.
    model.n_training_iterations = args.n_training_iter
    model.n_states = args.n_states
    model.topology = args.topology
    model.verbose = args.verbose
    model.transition_init = args.transition_init
    model.emission_init = args.emission_init
    model.covar_type = args.covar_type
    return HMMClassifier(model, n_jobs=args.n_jobs)
def get_classifier(args):
    """Build an ``HMMClassifier`` around the model selected by ``args.model``.

    Supported values for ``args.model``: ``'hmmlearn'``, ``'fhmm-exact'``,
    ``'fhmm-seq'``, ``'pomegranate'``.

    Args:
        args: parsed CLI arguments; reads ``model``, ``n_chains``,
            ``n_iterations``, ``n_states``, ``topology``, ``n_jobs``.

    Returns:
        An ``HMMClassifier`` wrapping the configured model.

    Raises:
        ValueError: if ``args.model`` names an unknown model type.
    """
    if args.model == 'hmmlearn':
        model = HMMLearnModel()
    elif args.model == 'fhmm-exact':
        model = ExactGaussianFHMM(n_chains=args.n_chains)
    elif args.model == 'fhmm-seq':
        model = SequentialGaussianFHMM(n_chains=args.n_chains)
    elif args.model == 'pomegranate':
        model = PomegranateModel()
    else:
        # Raise explicitly instead of `assert model is not None`: asserts are
        # stripped under `python -O`, which would hide a bad --model value.
        raise ValueError('unknown model type "%s"' % args.model)
    # Hyper-parameters shared by every model variant.
    model.n_training_iterations = args.n_iterations
    model.n_states = args.n_states
    model.topology = args.topology
    return HMMClassifier(model, n_jobs=args.n_jobs)
# 示例#3 ("Example #3") — scraped-page artifact separating code snippets;
# the following "0" was presumably a vote/score count, kept commented out
# so the module stays importable:
# 0
def get_classifier(args):
    """Instantiate the model named by ``args.model`` and wrap it in an
    ``HMMClassifier`` configured from the parsed CLI arguments.
    """
    # Map model names to zero-argument factories. The lambdas defer both the
    # class lookup and the `args.n_chains` access until the chosen entry is
    # actually invoked, so only the requested model type is ever touched.
    factories = {
        'hmmlearn': lambda: HMMLearnModel(),
        'fhmm-exact': lambda: ExactGaussianFHMM(n_chains=args.n_chains),
        'fhmm-seq': lambda: SequentialGaussianFHMM(n_chains=args.n_chains),
        'pomegranate': lambda: PomegranateModel(),
    }
    factory = factories.get(args.model)
    model = factory() if factory is not None else None
    assert model is not None
    # Common hyper-parameters shared by all variants.
    model.n_training_iterations = args.n_iterations
    model.n_states = args.n_states
    model.topology = args.topology
    return HMMClassifier(model, n_jobs=args.n_jobs)
def evaluate(X, args):
    """Cross-validate a GaussianHMM on ``X`` via random shuffle splits.

    For each of ``args.n_iterations`` splits, fits a fresh model on the
    preprocessed training fold and scores the log-likelihood of every
    sequence in both folds.

    Returns:
        Tuple ``(train_mean, train_std, test_mean, test_std)`` of floats over
        all per-sequence log-likelihoods accumulated across splits.
    """
    splitter = ShuffleSplit(len(X), n_iter=args.n_iterations, test_size=args.test_size)
    train_lls = []
    test_lls = []
    for train_idx, test_idx in splitter:
        train_set = [X[i] for i in train_idx]
        test_set = [X[i] for i in test_idx]
        # Preprocessing is fit per split so test data never leaks into it.
        train_set, test_set = preprocess_datasets(train_set, test_set, args)
        hmm = GaussianHMM(n_states=args.n_states,
                          n_training_iterations=args.n_training_iterations,
                          topology=args.topology)
        hmm.fit(train_set)
        train_lls.extend(hmm.loglikelihood(seq) for seq in train_set)
        test_lls.extend(hmm.loglikelihood(seq) for seq in test_set)

    train_arr = np.array(train_lls)
    test_arr = np.array(test_lls)
    return (float(np.mean(train_arr)), float(np.std(train_arr)),
            float(np.mean(test_arr)), float(np.std(test_arr)))
def main(args):
    """Train a per-label HMM classifier on a pickled data set and print a
    covariance-derived score for every feature.

    Side effects: reads the pickle at ``args.dataset`` from disk and prints
    progress plus the sorted score table to stdout; exits the process if the
    data set path does not exist.
    """
    start_total = timeit.default_timer()  # NOTE(review): never read in this view — dead unless used below the visible range
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    # Validate that paths exist so that we don't need to check that whenever we use it
    if not os.path.exists(args.dataset):
        exit('data set at path "%s" does not exist' % args.dataset)

    # Print command again to make it easier to re-produce later from the logs
    print('python ' + ' '.join(sys.argv))
    print('')

    print('args:')
    print(args)
    print('')

    # Load dataset
    print('loading data set "%s" ...' % args.dataset)
    start = timeit.default_timer()
    with open(args.dataset, 'rb') as f:
        # NOTE(review): pickle.load on an arbitrary path executes code from
        # the file — only use with trusted data sets.
        dataset = pickle.load(f)
        if type(dataset) != data.Dataset:  # NOTE(review): exact-type check; subclasses of data.Dataset are rejected too
            raise ValueError('invalid dataset')
    print('done, took %fs' % (timeit.default_timer() - start))
    # Optionally project the data set down to a user-selected feature subset;
    # skipped when the requested features already equal the stored ones.
    if args.features is not None and args.features != dataset.feature_names:
        print('selecting features ...')
        features = args.features
        start = timeit.default_timer()
        dataset = dataset.dataset_from_feature_names(features)
        print('done, took %fs' % (timeit.default_timer() - start))
    print('')

    # Print overview
    print('dataset overview:')
    print('  samples:  %d' % dataset.n_samples)
    print('  labels:   %s' % ', '.join(dataset.unique_labels))
    print('  features: %s' % ', '.join(dataset.feature_names))
    print('')

    # Apply CLI-configured data transformers before training.
    transformers = data.transformers_from_args(args)
    dataset = dataset.dataset_from_transformers(transformers)

    # Model configuration is hard-coded here (unlike get_classifier, which
    # reads it from args) — presumably intentional for this analysis script.
    model = GaussianHMM()
    model.n_training_iterations = 10
    model.n_states = 6
    model.topology = 'left-to-right-1'
    model.verbose = args.verbose
    model.transition_init = 'uniform'
    model.emission_init = 'k-means'
    model.covar_type = 'diag'
    classifier = HMMClassifier(model, n_jobs=args.n_jobs)

    print('training classifier ...')
    start = timeit.default_timer()
    classifier.fit(dataset.X, dataset.y)
    print('done, took %fs' % (timeit.default_timer() - start))

    # Accumulate one score per feature, averaged over all per-label models.
    total_scores = np.zeros(len(dataset.feature_names))
    for idx, model in enumerate(classifier.models_):
        label_name = dataset.unique_labels[idx]
        print('important features for %s:' % label_name)
        # Average the trained model's covariances over states; with
        # covar_type='diag' this yields one value per feature dimension.
        mean_covar = np.mean(model.model_.covars_, axis=0)

        # Reduce to a single score per feature
        # (features may span several contiguous dimensions; average them).
        scores = np.zeros(len(dataset.feature_names))
        start_idx = 0
        for feature_idx, length in enumerate(dataset.feature_lengths):
            end_idx = start_idx + length
            print('from %d to %d' % (start_idx, end_idx))
            scores[feature_idx] = np.mean(mean_covar[start_idx:end_idx])
            start_idx += length

        total_scores += scores
        #
        # sorted_exploded_feature_names = exploded_feature_names[sorted_features_indexes]
        # sorted_feature_names = []
        # feature_scores = {}
        # for name_idx, exploded_name in enumerate(sorted_exploded_feature_names):
        #     name = exploded_name.split('*')[0]
        #     if name not in sorted_feature_names:
        #         sorted_feature_names.append(name)
        #         feature_scores[name] = 0
        #     feature_scores[name] += name_idx
        # for name, length in zip(dataset.feature_names, dataset.feature_lengths):
        #     feature_scores[name] /= length
        # print np.array(feature_scores.keys())[np.argsort(feature_scores.values())]
        # print('')
        #
        # if total_feature_scores is None:
        #     total_feature_scores = feature_scores
        # else:
        #     for k, v in feature_scores.iteritems():
        #         total_feature_scores[k] += v
    total_scores /= dataset.n_labels
    print('')
    print('total scores:')
    # Print features in ascending score order (lowest mean covariance first).
    sorted_indexes = np.argsort(total_scores)
    sorted_names = np.array(dataset.feature_names)[sorted_indexes]
    sorted_scores = total_scores[sorted_indexes]
    for name, score in zip(sorted_names, sorted_scores):
        print('%s: %f' % (name, score))
def get_classifier(args):
    """Construct an ``HMMClassifier`` around the model type requested by
    ``args.model`` ('hmm', 'fhmm-exact' or 'fhmm-seq') and forward the
    shared hyper-parameters from the parsed CLI arguments.
    """
    # Zero-argument builders keep model construction lazy: only the selected
    # entry is invoked, so unrelated classes/arguments are never evaluated.
    builders = {
        'hmm': lambda: GaussianHMM(),
        'fhmm-exact': lambda: ExactGaussianFHMM(n_chains=args.n_chains),
        'fhmm-seq': lambda: SequentialGaussianFHMM(n_chains=args.n_chains),
    }
    make_model = builders.get(args.model)
    model = make_model() if make_model is not None else None
    assert model is not None
    # Copy the common configuration onto the freshly built model.
    for attr, value in (('n_training_iterations', args.n_training_iter),
                        ('n_states', args.n_states),
                        ('topology', args.topology),
                        ('verbose', args.verbose),
                        ('transition_init', args.transition_init),
                        ('emission_init', args.emission_init),
                        ('covar_type', args.covar_type)):
        setattr(model, attr, value)
    return HMMClassifier(model, n_jobs=args.n_jobs)