def export(dataset, args):
    """Write *dataset* to ``args.output`` in the format named by ``args.format``.

    Supported formats: 'matlab' (ragged X), 'matlab-stacked' (sequences padded
    to a common length, padded length stored under 'T'), and 'pickle'.
    Transformers configured on the CLI are applied before exporting.
    """
    fmt = args.format
    output = args.output
    transformers = data.transformers_from_args(args)
    if len(transformers) > 0:
        print('applying transformers %s ...' % transformers)
        dataset = dataset.dataset_from_transformers(transformers)
    print('exporting dataset ...')
    if fmt in ('matlab', 'matlab-stacked'):
        if fmt == 'matlab-stacked':
            X_pad = pad_sequences(dataset.X)
            # 'T' records the padded sequence length for the MATLAB side.
            matlab_data = {'X': X_pad, 'T': X_pad.shape[1]}
        else:
            matlab_data = {'X': dataset.X}
        matlab_data['y'] = dataset.y
        matlab_data['unique_labels'] = dataset.unique_labels
        matlab_data['feature_names'] = dataset.feature_names
        matlab_data['feature_lengths'] = dataset.feature_lengths
        scipy.io.savemat(output, matlab_data)
    elif fmt == 'pickle':
        with open(output, 'wb') as f:
            pickle.dump(dataset, f)
    print('done!')
def evaluate_decision_maker(decision_maker, train, test, args):
    """Fit CLI-configured transformers and *decision_maker* on the train
    split, then return the decision maker's predictions for the test split.

    *train* and *test* are ``(X, y)`` pairs; the test labels are unpacked but
    not used here (evaluation happens in the caller).
    """
    train_X, train_y = train
    test_X, test_y = test
    for transformer in data.transformers_from_args(args):
        # Stateful transformers expose fit(); stateless ones may not.
        if callable(getattr(transformer, 'fit', None)):
            transformer.fit(train_X)
        train_X = transformer.transform(train_X)
        test_X = transformer.transform(test_X)
    if callable(getattr(decision_maker, 'fit', None)):
        decision_maker.fit(train_X, train_y)
    return decision_maker.predict(test_X)
def evaluate_decision_maker(decision_maker, train, test, args):
    """Apply the configured transformers, fit *decision_maker* on the train
    split and predict labels for the test split.

    NOTE(review): this definition duplicates an identical function earlier in
    the file; the later definition wins at import time.
    """
    train_X, train_y = train
    test_X, test_y = test
    transformers = data.transformers_from_args(args)
    for t in transformers:
        fit = getattr(t, 'fit', None)
        if callable(fit):
            # Only stateful transformers need fitting (on train data only).
            fit(train_X)
        train_X = t.transform(train_X)
        test_X = t.transform(test_X)
    fit = getattr(decision_maker, 'fit', None)
    if callable(fit):
        fit(train_X, train_y)
    test_pred = decision_maker.predict(test_X)
    return test_pred
def plot(dataset, args):
    """Plot each sample of *dataset* as one figure, one curve per feature
    column over time steps.

    If ``args.plot_labels`` is given and matches the number of feature
    columns, it is used as the legend; otherwise a warning is logged.
    """
    transformers = data.transformers_from_args(args)
    if len(transformers) > 0:
        dataset = dataset.dataset_from_transformers(transformers)
    plot_labels = args.plot_labels
    for features in dataset.X:
        n_samples, n_features = features.shape
        x = np.arange(n_samples)
        for feature_idx in xrange(n_features):
            feature = features[:, feature_idx]
            plt.plot(x, feature)
        if plot_labels is not None:
            if len(plot_labels) == n_features:
                plt.legend(plot_labels, loc='upper left')
            else:
                # Fix: logging.warn is a deprecated alias of logging.warning.
                logging.warning('plot-labels must have length %d' % n_features)
        plt.xlabel('time steps')
        plt.ylabel('feature value')
        # One blocking window per sample.
        plt.show()
def main(args):
    """Train a GaussianHMM classifier on the pickled dataset at
    ``args.dataset`` and print per-feature importance scores.

    The dataset is unpickled, optionally restricted to ``args.features``,
    passed through the CLI-configured transformers, and used to fit one HMM
    per label.  Each fitted model's mean covariance is reduced to one score
    per feature; scores are averaged over all labels and printed sorted.

    Raises ValueError if the unpickled object is not a data.Dataset.
    """
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    # Validate that paths exist so that we don't need to check that whenever we use it
    if not os.path.exists(args.dataset):
        exit('data set at path "%s" does not exist' % args.dataset)

    # Print command again to make it easier to re-produce later from the logs
    print('python ' + ' '.join(sys.argv))
    print('')
    print('args:')
    print(args)
    print('')

    # Load dataset
    print('loading data set "%s" ...' % args.dataset)
    start = timeit.default_timer()
    with open(args.dataset, 'rb') as f:
        dataset = pickle.load(f)
    # Fix: isinstance instead of exact type comparison (accepts subclasses).
    if not isinstance(dataset, data.Dataset):
        raise ValueError('invalid dataset')
    print('done, took %fs' % (timeit.default_timer() - start))

    if args.features is not None and args.features != dataset.feature_names:
        print('selecting features ...')
        features = args.features
        start = timeit.default_timer()
        dataset = dataset.dataset_from_feature_names(features)
        print('done, took %fs' % (timeit.default_timer() - start))
    print('')

    # Print overview
    print('dataset overview:')
    print(' samples: %d' % dataset.n_samples)
    print(' labels: %s' % ', '.join(dataset.unique_labels))
    print(' features: %s' % ', '.join(dataset.feature_names))
    print('')

    transformers = data.transformers_from_args(args)
    dataset = dataset.dataset_from_transformers(transformers)

    # Fixed hyper-parameters: this entry point analyzes feature importance,
    # it is not a model search.
    model = GaussianHMM()
    model.n_training_iterations = 10
    model.n_states = 6
    model.topology = 'left-to-right-1'
    model.verbose = args.verbose
    model.transition_init = 'uniform'
    model.emission_init = 'k-means'
    model.covar_type = 'diag'
    classifier = HMMClassifier(model, n_jobs=args.n_jobs)

    print('training classifier ...')
    start = timeit.default_timer()
    classifier.fit(dataset.X, dataset.y)
    print('done, took %fs' % (timeit.default_timer() - start))

    total_scores = np.zeros(len(dataset.feature_names))
    # Fix: renamed loop variable (was `model`) to avoid shadowing the
    # prototype model configured above.
    for idx, label_model in enumerate(classifier.models_):
        label_name = dataset.unique_labels[idx]
        print('important features for %s:' % label_name)
        mean_covar = np.mean(label_model.model_.covars_, axis=0)

        # Reduce to a single score per feature: average the covariance over
        # the columns belonging to each (possibly multi-column) feature.
        scores = np.zeros(len(dataset.feature_names))
        start_idx = 0
        for feature_idx, length in enumerate(dataset.feature_lengths):
            end_idx = start_idx + length
            print('from %d to %d' % (start_idx, end_idx))
            scores[feature_idx] = np.mean(mean_covar[start_idx:end_idx])
            start_idx += length
        total_scores += scores

    total_scores /= dataset.n_labels
    print('')
    print('total scores:')
    sorted_indexes = np.argsort(total_scores)
    sorted_names = np.array(dataset.feature_names)[sorted_indexes]
    sorted_scores = total_scores[sorted_indexes]
    for name, score in zip(sorted_names, sorted_scores):
        print('%s: %f' % (name, score))
def _compute_averaged_pos_and_neg_lls(dataset, iterator, prefix, args, save_model=False, compute_distances=False):
    """Cross-validate a classifier and summarize test-set loglikelihoods.

    For each (train, test) split from *iterator*, a fresh classifier is
    fitted on the train split and its per-label loglikelihoods are computed
    on the TEST split.  The stacked loglikelihoods are summarized per label
    as mean/std over positive (y == 1) and negative (y == 0) samples.

    Returns a dict with keys 'pos_ll_means', 'pos_ll_stds', 'neg_ll_means',
    'neg_ll_stds', 'distance_means', 'combined_lls', 'combined_ys'.
    """
    combined_lls = []
    combined_ys = []
    rounds = list(iterator)
    distances = []
    if args.measure == 'hmm-distance':
        compute_distances = True
    for rnd, (train_indexes, test_indexes) in enumerate(rounds):
        transformers = data.transformers_from_args(args)
        train, test = dataset.split_train_test(train_indexes, test_indexes, transformers)

        # Train classifier and save model
        classifier = get_classifier(args)
        classifier.fit(train.X, train.y)
        if save_model and args.output_dir is not None:
            filename = '%s_rnd%d_model.pkl' % (prefix, rnd + 1)
            with open(os.path.join(args.output_dir, filename), 'wb') as f:
                pickle.dump(classifier, f)

        # Calculate distances. I have no idea why, but having n_jobs > 1 is causing a deadlock. Again, I have no clue
        # how/why this could happen, hence this workaround.
        if compute_distances:
            old_jobs = args.n_jobs
            args.n_jobs = 1
            try:
                distances.append(classifier.distances(200, loglikelihood_method=args.loglikelihood_method))
            finally:
                # Fix: restore n_jobs even if distances() raises, so args is
                # never left with the temporary single-job setting.
                args.n_jobs = old_jobs
        else:
            # Placeholder so the averaged distance matrix keeps its shape.
            d = np.zeros((dataset.n_labels, dataset.n_labels))
            distances.append(d)

        # Calculate likelihoods UNDER THE TEST SET (!!!).
        test_lls = classifier.loglikelihoods(test.X, method=args.loglikelihood_method)
        combined_lls.append(test_lls)
        combined_ys.append(test.y)

    combined_lls = np.vstack(combined_lls)
    combined_ys = np.vstack(combined_ys)
    assert combined_lls.shape == combined_ys.shape
    n_samples, n_labels = combined_lls.shape

    # Per-label mean/std of loglikelihoods, split by ground-truth membership.
    pos_ll_means = []
    pos_ll_stds = []
    neg_ll_means = []
    neg_ll_stds = []
    for label_idx in xrange(n_labels):
        label_lls = combined_lls[:, label_idx]
        curr_y = combined_ys[:, label_idx]
        pos_label_lls = label_lls[curr_y == 1]
        neg_label_lls = label_lls[curr_y == 0]
        assert np.size(pos_label_lls) + np.size(neg_label_lls) == n_samples
        pos_ll_means.append(np.mean(pos_label_lls))
        pos_ll_stds.append(np.std(pos_label_lls))
        neg_ll_means.append(np.mean(neg_label_lls))
        neg_ll_stds.append(np.std(neg_label_lls))
    pos_ll_means = np.array(pos_ll_means)
    pos_ll_stds = np.array(pos_ll_stds)
    neg_ll_means = np.array(neg_ll_means)
    neg_ll_stds = np.array(neg_ll_stds)
    assert pos_ll_means.shape == neg_ll_means.shape
    assert pos_ll_stds.shape == neg_ll_stds.shape
    assert pos_ll_means.shape == pos_ll_stds.shape
    assert pos_ll_means.shape == (dataset.n_labels,)

    # Calculate averaged distances
    averaged_distances = np.mean(np.array(distances), axis=0)
    assert averaged_distances.shape == (dataset.n_labels, dataset.n_labels)

    # Save likelihoods and distances
    if args.output_dir is not None:
        save_results(args.output_dir, combined_ys, None, combined_lls, prefix)
        if compute_distances:
            np.savetxt(os.path.join(args.output_dir, '%s_distances.csv' % prefix), averaged_distances,
                       delimiter=';', fmt='%f')
    return {'pos_ll_means': pos_ll_means, 'pos_ll_stds': pos_ll_stds, 'neg_ll_means': neg_ll_means,
            'neg_ll_stds': neg_ll_stds, 'distance_means': averaged_distances, 'combined_lls': combined_lls,
            'combined_ys': combined_ys}
def _evaluate_model(dataset, iterator, args, print_results=False):
    """Run one train/evaluate cycle per split yielded by *iterator*.

    Each round fits a fresh classifier on the train split, computes
    loglikelihoods on both splits, optionally fits/applies a CLI-configured
    decision maker, and (when ``args.output_dir`` is set) pickles the model
    and saves per-round results.  After all rounds, stacked results are saved
    and, if *print_results*, summary reports are printed.

    Returns (train_loglikelihoods, train_labels, test_loglikelihoods,
    test_labels) — four lists with one entry per round.
    """
    loglikelihood_method = args.loglikelihood_method

    # Collect stats
    train_loglikelihoods = []
    train_predictions = []
    train_labels = []
    test_loglikelihoods = []
    test_predictions = []
    test_labels = []
    for rnd, (train_indexes, test_indexes) in enumerate(iterator):
        # Splits must be disjoint — otherwise test scores would be inflated.
        assert len(set(train_indexes).intersection(set(test_indexes))) == 0
        transformers = data.transformers_from_args(args)
        train, test = dataset.split_train_test(train_indexes, test_indexes, transformers)
        assert train.n_samples == len(train_indexes)
        assert test.n_samples == len(test_indexes)
        train_labels.append(train.y)
        test_labels.append(test.y)

        # Fresh classifier per round so rounds stay independent.
        classifier = get_classifier(args)
        if print_results:
            print('evaluation round %d' % (rnd + 1))
            print(' train split: %s' % train_indexes)
            print(' test split: %s' % test_indexes)
            print(' training classifier on training samples ...')
        start = timeit.default_timer()
        classifier.fit(train.X, train.y)
        stop = timeit.default_timer()
        if args.output_dir is not None:
            # Persist the fitted model for this round.
            name = 'rnd%d_model.pkl' % (rnd+1)
            with open(os.path.join(args.output_dir, name), 'wb') as f:
                pickle.dump(classifier, f)
        if print_results:
            print(' done, took %fs' % (stop - start))

        if print_results:
            print(' computing %s loglikelihoods on train dataset ...' % loglikelihood_method)
        start = timeit.default_timer()
        train_ll = classifier.loglikelihoods(train.X, method=loglikelihood_method)
        train_loglikelihoods.append(train_ll)
        stop = timeit.default_timer()
        if print_results:
            print(' done, took %fs' % (stop - start))

        if print_results:
            print(' computing %s loglikelihoods on test dataset ...' % loglikelihood_method)
        start = timeit.default_timer()
        test_ll = classifier.loglikelihoods(test.X, method=loglikelihood_method)
        test_loglikelihoods.append(test_ll)
        stop = timeit.default_timer()
        if print_results:
            print(' done, took %fs' % (stop - start))

        # The decision maker (optional) turns loglikelihoods into label
        # predictions; it may itself need fitting on the train loglikelihoods.
        decision_maker = decision.decision_maker_from_args(args)
        train_pred, test_pred = None, None
        if decision_maker is not None:
            if hasattr(decision_maker, 'fit') and callable(decision_maker.fit):
                if print_results:
                    print(' training decision maker %s on train loglikelihoods ...' % args.decision_maker)
                start = timeit.default_timer()
                decision_maker.fit(train_ll, train.y)
                stop = timeit.default_timer()
                if print_results:
                    print(' done, took %fs' % (stop - start))
            if print_results:
                print(' predicting labels on train dataset ...')
            start = timeit.default_timer()
            train_pred = decision_maker.predict(train_ll)
            train_predictions.append(train_pred)
            stop = timeit.default_timer()
            if print_results:
                print(' done, took %fs' % (stop - start))
            if print_results:
                print(' predicting labels on test dataset ...')
            start = timeit.default_timer()
            test_pred = decision_maker.predict(test_ll)
            test_predictions.append(test_pred)
            stop = timeit.default_timer()
            if print_results:
                print(' done, took %fs' % (stop - start))
        if print_results:
            print('')

        # Save round results
        if args.output_dir is not None:
            save_results(args.output_dir, train.y, train_pred, train_ll, prefix='rnd%d_train' % (rnd+1))
            save_results(args.output_dir, test.y, test_pred, test_ll, prefix='rnd%d_test' % (rnd+1))

    # Combine and save combined results
    train_y_combined = np.vstack(train_labels)
    train_ll_combined = np.vstack(train_loglikelihoods)
    # Predictions only exist when a decision maker was configured.
    train_pred_combined = np.vstack(train_predictions) if len(train_predictions) > 0 else None
    test_ll_combined = np.vstack(test_loglikelihoods)
    test_y_combined = np.vstack(test_labels)
    test_pred_combined = np.vstack(test_predictions) if len(test_predictions) > 0 else None
    if args.output_dir is not None:
        save_results(args.output_dir, train_y_combined, train_pred_combined, train_ll_combined, 'combined_train')
        save_results(args.output_dir, test_y_combined, test_pred_combined, test_ll_combined, 'combined_test')

    if print_results:
        # Print report
        label_names = dataset.unique_labels
        print('*** train dataset summary ***')
        print('')
        print(metrics.multilabel_loglikelihood_summary_report(train_y_combined, train_ll_combined, target_names=label_names))
        print('')
        if train_pred_combined is not None:
            print(metrics.multilabel_classification_report(train_y_combined, train_pred_combined, target_names=label_names))
            print('total accuracy: %.3f' % sk_metrics.accuracy_score(train_y_combined, train_pred_combined))
            print('')
        print('')
        print('*** test dataset summary ***')
        print('')
        print(metrics.multilabel_loglikelihood_summary_report(test_y_combined, test_ll_combined, target_names=label_names))
        print('')
        if test_pred_combined is not None:
            print(metrics.multilabel_classification_report(test_y_combined, test_pred_combined, target_names=label_names))
            print('total accuracy: %.3f' % sk_metrics.accuracy_score(test_y_combined, test_pred_combined))
            print('')
    return train_loglikelihoods, train_labels, test_loglikelihoods, test_labels
def _compute_averaged_pos_and_neg_lls(dataset, iterator, prefix, args, save_model=False, compute_distances=False):
    """Fit one classifier per train/test split and aggregate test loglikelihoods.

    Per label, the stacked TEST-set loglikelihoods are reduced to mean/std
    over positive (y == 1) and negative (y == 0) samples; model distances are
    averaged over rounds (zeros when distance computation is disabled).

    NOTE(review): this definition duplicates an identical function earlier in
    the file; the later definition wins at import time.

    Returns a dict with keys 'pos_ll_means', 'pos_ll_stds', 'neg_ll_means',
    'neg_ll_stds', 'distance_means', 'combined_lls', 'combined_ys'.
    """
    combined_lls = []
    combined_ys = []
    rounds = list(iterator)
    distances = []
    if args.measure == 'hmm-distance':
        compute_distances = True
    for rnd, (train_indexes, test_indexes) in enumerate(rounds):
        transformers = data.transformers_from_args(args)
        train, test = dataset.split_train_test(train_indexes, test_indexes, transformers)

        # Train classifier and save model
        classifier = get_classifier(args)
        classifier.fit(train.X, train.y)
        if save_model and args.output_dir is not None:
            filename = '%s_rnd%d_model.pkl' % (prefix, rnd + 1)
            with open(os.path.join(args.output_dir, filename), 'wb') as f:
                pickle.dump(classifier, f)

        # Calculate distances. I have no idea why, but having n_jobs > 1 is causing a deadlock. Again, I have no clue
        # how/why this could happen, hence this workaround.
        if compute_distances:
            old_jobs = args.n_jobs
            args.n_jobs = 1
            try:
                distances.append(
                    classifier.distances(
                        200, loglikelihood_method=args.loglikelihood_method))
            finally:
                # Fix: restore n_jobs even on failure so the workaround can
                # never leak the temporary single-job setting into args.
                args.n_jobs = old_jobs
        else:
            # Zero matrix keeps the later shape assertion/averaging valid.
            d = np.zeros((dataset.n_labels, dataset.n_labels))
            distances.append(d)

        # Calculate likelihoods UNDER THE TEST SET (!!!).
        test_lls = classifier.loglikelihoods(test.X, method=args.loglikelihood_method)
        combined_lls.append(test_lls)
        combined_ys.append(test.y)

    combined_lls = np.vstack(combined_lls)
    combined_ys = np.vstack(combined_ys)
    assert combined_lls.shape == combined_ys.shape
    n_samples, n_labels = combined_lls.shape

    # Split each label's loglikelihoods by ground truth and summarize.
    pos_ll_means = []
    pos_ll_stds = []
    neg_ll_means = []
    neg_ll_stds = []
    for label_idx in xrange(n_labels):
        label_lls = combined_lls[:, label_idx]
        curr_y = combined_ys[:, label_idx]
        pos_label_lls = label_lls[curr_y == 1]
        neg_label_lls = label_lls[curr_y == 0]
        assert np.size(pos_label_lls) + np.size(neg_label_lls) == n_samples
        pos_ll_means.append(np.mean(pos_label_lls))
        pos_ll_stds.append(np.std(pos_label_lls))
        neg_ll_means.append(np.mean(neg_label_lls))
        neg_ll_stds.append(np.std(neg_label_lls))
    pos_ll_means = np.array(pos_ll_means)
    pos_ll_stds = np.array(pos_ll_stds)
    neg_ll_means = np.array(neg_ll_means)
    neg_ll_stds = np.array(neg_ll_stds)
    assert pos_ll_means.shape == neg_ll_means.shape
    assert pos_ll_stds.shape == neg_ll_stds.shape
    assert pos_ll_means.shape == pos_ll_stds.shape
    assert pos_ll_means.shape == (dataset.n_labels, )

    # Calculate averaged distances
    averaged_distances = np.mean(np.array(distances), axis=0)
    assert averaged_distances.shape == (dataset.n_labels, dataset.n_labels)

    # Save likelihoods and distances
    if args.output_dir is not None:
        save_results(args.output_dir, combined_ys, None, combined_lls, prefix)
        if compute_distances:
            np.savetxt(os.path.join(args.output_dir, '%s_distances.csv' % prefix),
                       averaged_distances, delimiter=';', fmt='%f')
    return {
        'pos_ll_means': pos_ll_means,
        'pos_ll_stds': pos_ll_stds,
        'neg_ll_means': neg_ll_means,
        'neg_ll_stds': neg_ll_stds,
        'distance_means': averaged_distances,
        'combined_lls': combined_lls,
        'combined_ys': combined_ys
    }
def _evaluate_model(dataset, iterator, args, print_results=False):
    """Evaluate a classifier over the cross-validation splits in *iterator*.

    Per round: fit a fresh classifier on the train split, compute
    loglikelihoods on both splits, optionally fit/apply a decision maker, and
    save the model plus per-round results when ``args.output_dir`` is set.
    Afterwards the stacked results are saved and, if *print_results*,
    train/test summary reports are printed.

    NOTE(review): duplicates an identical function earlier in the file; the
    later definition wins at import time.

    Returns the four per-round lists: (train_loglikelihoods, train_labels,
    test_loglikelihoods, test_labels).
    """
    loglikelihood_method = args.loglikelihood_method

    # Collect stats
    train_loglikelihoods = []
    train_predictions = []
    train_labels = []
    test_loglikelihoods = []
    test_predictions = []
    test_labels = []
    for rnd, (train_indexes, test_indexes) in enumerate(iterator):
        # Guard against leakage: train and test indexes must not overlap.
        assert len(set(train_indexes).intersection(set(test_indexes))) == 0
        transformers = data.transformers_from_args(args)
        train, test = dataset.split_train_test(train_indexes, test_indexes, transformers)
        assert train.n_samples == len(train_indexes)
        assert test.n_samples == len(test_indexes)
        train_labels.append(train.y)
        test_labels.append(test.y)

        # New classifier each round — no state shared across rounds.
        classifier = get_classifier(args)
        if print_results:
            print('evaluation round %d' % (rnd + 1))
            print(' train split: %s' % train_indexes)
            print(' test split: %s' % test_indexes)
            print(' training classifier on training samples ...')
        start = timeit.default_timer()
        classifier.fit(train.X, train.y)
        stop = timeit.default_timer()
        if args.output_dir is not None:
            # Pickle the fitted model for later inspection.
            name = 'rnd%d_model.pkl' % (rnd + 1)
            with open(os.path.join(args.output_dir, name), 'wb') as f:
                pickle.dump(classifier, f)
        if print_results:
            print(' done, took %fs' % (stop - start))

        if print_results:
            print(' computing %s loglikelihoods on train dataset ...' % loglikelihood_method)
        start = timeit.default_timer()
        train_ll = classifier.loglikelihoods(train.X, method=loglikelihood_method)
        train_loglikelihoods.append(train_ll)
        stop = timeit.default_timer()
        if print_results:
            print(' done, took %fs' % (stop - start))

        if print_results:
            print(' computing %s loglikelihoods on test dataset ...' % loglikelihood_method)
        start = timeit.default_timer()
        test_ll = classifier.loglikelihoods(test.X, method=loglikelihood_method)
        test_loglikelihoods.append(test_ll)
        stop = timeit.default_timer()
        if print_results:
            print(' done, took %fs' % (stop - start))

        # Optional decision maker maps loglikelihoods to label predictions.
        decision_maker = decision.decision_maker_from_args(args)
        train_pred, test_pred = None, None
        if decision_maker is not None:
            if hasattr(decision_maker, 'fit') and callable(decision_maker.fit):
                if print_results:
                    print(' training decision maker %s on train loglikelihoods ...' % args.decision_maker)
                start = timeit.default_timer()
                decision_maker.fit(train_ll, train.y)
                stop = timeit.default_timer()
                if print_results:
                    print(' done, took %fs' % (stop - start))
            if print_results:
                print(' predicting labels on train dataset ...')
            start = timeit.default_timer()
            train_pred = decision_maker.predict(train_ll)
            train_predictions.append(train_pred)
            stop = timeit.default_timer()
            if print_results:
                print(' done, took %fs' % (stop - start))
            if print_results:
                print(' predicting labels on test dataset ...')
            start = timeit.default_timer()
            test_pred = decision_maker.predict(test_ll)
            test_predictions.append(test_pred)
            stop = timeit.default_timer()
            if print_results:
                print(' done, took %fs' % (stop - start))
        if print_results:
            print('')

        # Save round results
        if args.output_dir is not None:
            save_results(args.output_dir, train.y, train_pred, train_ll, prefix='rnd%d_train' % (rnd + 1))
            save_results(args.output_dir, test.y, test_pred, test_ll, prefix='rnd%d_test' % (rnd + 1))

    # Combine and save combined results
    train_y_combined = np.vstack(train_labels)
    train_ll_combined = np.vstack(train_loglikelihoods)
    # Predictions are empty when no decision maker was configured.
    train_pred_combined = np.vstack(
        train_predictions) if len(train_predictions) > 0 else None
    test_ll_combined = np.vstack(test_loglikelihoods)
    test_y_combined = np.vstack(test_labels)
    test_pred_combined = np.vstack(
        test_predictions) if len(test_predictions) > 0 else None
    if args.output_dir is not None:
        save_results(args.output_dir, train_y_combined, train_pred_combined, train_ll_combined, 'combined_train')
        save_results(args.output_dir, test_y_combined, test_pred_combined, test_ll_combined, 'combined_test')

    if print_results:
        # Print report
        label_names = dataset.unique_labels
        print('*** train dataset summary ***')
        print('')
        print(
            metrics.multilabel_loglikelihood_summary_report(
                train_y_combined, train_ll_combined, target_names=label_names))
        print('')
        if train_pred_combined is not None:
            print(
                metrics.multilabel_classification_report(
                    train_y_combined, train_pred_combined, target_names=label_names))
            print(
                'total accuracy: %.3f' % sk_metrics.accuracy_score(
                    train_y_combined, train_pred_combined))
            print('')
        print('')
        print('*** test dataset summary ***')
        print('')
        print(
            metrics.multilabel_loglikelihood_summary_report(
                test_y_combined, test_ll_combined, target_names=label_names))
        print('')
        if test_pred_combined is not None:
            print(
                metrics.multilabel_classification_report(
                    test_y_combined, test_pred_combined, target_names=label_names))
            print(
                'total accuracy: %.3f' % sk_metrics.accuracy_score(test_y_combined, test_pred_combined))
            print('')
    return train_loglikelihoods, train_labels, test_loglikelihoods, test_labels