def new_dataset_with_same_ids_classes(in_ds):
    # builds a random-feature dataset reusing the sample ids, classes and labels
    # of in_ds; relies on a module-level max_feat_dim to bound the dimensionality
    feat_dim = np.random.randint(1, max_feat_dim)
    out_ds = MLDataset()
    for id_ in in_ds.keys:
        out_ds.add_sample(id_, np.random.rand(feat_dim),
                          class_id=in_ds.classes[id_],
                          label=in_ds.labels[id_])

    return out_ds
def make_fully_separable_classes(max_class_size=10, max_dim=22):
    # builds a two-class MLDataset from two well-separated Gaussian blobs
    from sklearn.datasets import make_blobs

    random_center = np.random.rand(max_dim)
    cluster_std = 1.5
    centers = [random_center, random_center + cluster_std * 6]
    blobs_X, blobs_y = make_blobs(n_samples=max_class_size, n_features=max_dim,
                                  centers=centers, cluster_std=cluster_std)

    unique_labels = np.unique(blobs_y)
    class_ids = {lbl: str(lbl) for lbl in unique_labels}

    new_ds = MLDataset()
    for index, row in enumerate(blobs_X):
        new_ds.add_sample('sub{}'.format(index), row,
                          label=blobs_y[index],
                          class_id=class_ids[blobs_y[index]])

    return new_ds
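# Hedged usage sketch (not part of the original suite): a quick check that the
# generator above yields exactly two distinct classes. The test name and the
# chosen sizes below are illustrative assumptions only.
def test_make_fully_separable_classes_sketch():
    sep_ds = make_fully_separable_classes(max_class_size=20, max_dim=5)
    assert len(set(sep_ds.classes.values())) == 2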
def test_eq_copy():
    new_copy = MLDataset(in_dataset=copy_dataset)
    assert new_copy == copy_dataset
def test_init_with_dict():
    new_ds = MLDataset(data=test_dataset.data,
                       labels=test_dataset.labels,
                       classes=test_dataset.classes)
    assert new_ds == test_dataset
def test_cant_read_nonexisting_file():
    with raises(IOError):
        a = MLDataset('/nonexistentrandomdir/disofddlsfj/arbitrary.noname.pkl')
import os

import numpy as np
from pytest import raises, warns

from pyradigm import MLDataset

out_dir = '.'

num_classes = np.random.randint(2, 50)
class_sizes = np.random.randint(10, 100, num_classes)
num_features = np.random.randint(10, 100)
num_samples = sum(class_sizes)

class_set = np.array(['C{:05d}'.format(x) for x in range(num_classes)])
feat_names = np.array([str(x) for x in range(num_features)])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

# same IDs, new features
same_ids_new_feat = MLDataset()
for sub_id in test_dataset.keys:
    feat = np.random.random(num_features)
    same_ids_new_feat.add_sample(sub_id, feat,
                                 test_dataset.labels[sub_id],
                                 test_dataset.classes[sub_id])
def import_datasets(method_list, out_dir, subjects, classes,
                    feature_path, feature_type='dir_of_dirs',
                    user_impute_strategy=cfg.default_imputation_strategy):
    """
    Imports all the specified feature sets and organizes them into datasets.

    Parameters
    ----------
    method_list : list of callables
        Set of predefined methods returning a vector of features for a given
        sample id and location.

    out_dir : str
        Path to the output folder.

    subjects : list of str
        List of sample ids.

    classes : dict
        Dict identifying the class for each sample id in the dataset.

    feature_path : list of str
        List of paths to the root directories containing the features
        (pre- or user-defined). Must be of the same length as method_list.

    feature_type : str
        A string identifying the structure of the feature set.
        Choices = ('dir_of_dirs', 'data_matrix')

    user_impute_strategy : str
        Strategy to handle missing data: whether to raise an error when data
        is missing, or to impute it using the method chosen here.

    Returns
    -------
    method_names : list of str
        List of method names used for annotation.

    dataset_paths_file : str
        Path to the file containing paths to the imported feature sets.

    missing_data_flag : list
        List of boolean flags indicating whether data is missing in each of
        the input datasets.

    impute_strategy : str or None
        The finalized imputation strategy (None when no data is missing).

    """

    def clean_str(string):
        return ' '.join(string.strip().split(' _-:\n\r\t'))

    from neuropredict.io import process_pyradigm, process_arff

    method_names = list()
    outpath_list = list()
    missing_data_flag = list()  # boolean flag for each dataset

    for mm, cur_method in enumerate(method_list):
        if cur_method in [get_pyradigm]:

            method_name, out_path_cur_dataset = process_pyradigm(
                    feature_path[mm], subjects, classes)

            # if feature_type in ['pyradigm']:
            #     loaded_dataset = MLDataset(filepath=feature_path[mm])
            # else:
            #     raise ValueError('Invalid state of the program!')
            #
            # if len(loaded_dataset.description) > 1:
            #     method_name = loaded_dataset.description
            # else:
            #     method_name = basename(feature_path[mm])
            #
            # method_names.append(clean_str(method_name))
            # if not saved_dataset_matches(loaded_dataset, subjects, classes):
            #     raise ValueError(
            #         'supplied pyradigm dataset does not match samples in the meta data.')
            # else:
            #     out_path_cur_dataset = feature_path[mm]

        elif cur_method in [get_arff]:
            method_name, out_path_cur_dataset = process_arff(
                    feature_path[mm], subjects, classes, out_dir)

            # loaded_dataset = MLDataset(arff_path=feature_path[mm])
            # if len(loaded_dataset.description) > 1:
            #     method_name = loaded_dataset.description
            # else:
            #     method_name = basename(feature_path[mm])
            #
            # method_names.append(clean_str(method_name))
            # out_name = make_dataset_filename(method_name)
            # out_path_cur_dataset = pjoin(out_dir, out_name)
            # loaded_dataset.save(out_path_cur_dataset)

        else:
            if cur_method in [get_dir_of_dirs]:
                method_name = basename(feature_path[mm])

            elif cur_method in [get_data_matrix]:
                method_name = os.path.splitext(basename(feature_path[mm]))[0]

            else:
                method_name = cur_method.__name__

            out_name = make_dataset_filename(method_name)

            out_path_cur_dataset = pjoin(out_dir, out_name)
            if not saved_dataset_matches(out_path_cur_dataset, subjects, classes):
                # noinspection PyTypeChecker
                out_path_cur_dataset = get_features(subjects, classes,
                                                    feature_path[mm],
                                                    out_dir, out_name,
                                                    cur_method, feature_type)

        # checking for the presence of any missing data
        data_mat, targets, ids = MLDataset(
                filepath=out_path_cur_dataset).data_and_labels()
        is_nan = np.isnan(data_mat)
        if is_nan.any():
            data_missing_here = True
            num_sub_with_md = np.sum(is_nan.sum(axis=1) > 0)
            num_var_with_md = np.sum(is_nan.sum(axis=0) > 0)
            if user_impute_strategy == 'raise':
                raise MissingDataException(
                        '{}/{} subjects with missing data found in {}/{} features\n'
                        '\tin {} dataset at {}\n'
                        '\tFill them and rerun, '
                        'or choose one of the available imputation strategies: {}'
                        ''.format(num_sub_with_md, data_mat.shape[0],
                                  num_var_with_md, data_mat.shape[1],
                                  method_name, out_path_cur_dataset,
                                  cfg.avail_imputation_strategies))
        else:
            data_missing_here = False

        method_names.append(clean_str(method_name))
        outpath_list.append(out_path_cur_dataset)
        missing_data_flag.append(data_missing_here)

    # finalizing the imputation strategy
    if any(missing_data_flag):
        print('\nOne or more of the input datasets have missing data!')
        if user_impute_strategy == 'raise':
            raise MissingDataException(
                    'Fill them and rerun, or choose one of the available '
                    'imputation strategies: {}'
                    ''.format(cfg.avail_imputation_strategies))
        else:
            impute_strategy = user_impute_strategy
            print('The imputation strategy chosen is: {}'.format(impute_strategy))
    else:
        # disabling imputation altogether when there is no missing data
        impute_strategy = None
        if user_impute_strategy in ('raise', None):
            print('Ignoring the imputation strategy chosen, '
                  'as no missing data were found!')

    combined_name = uniq_combined_name(method_names)

    # checking for duplicates
    if len(set(outpath_list)) < len(outpath_list):
        raise RuntimeError('Duplicate paths to input datasets found!\n'
                           'Try to distinguish the inputs further. '
                           'Otherwise, report this bug '
                           '@ github.com/raamana/neuropredict/issues/new')

    dataset_paths_file = pjoin(out_dir, 'datasetlist.' + combined_name + '.txt')
    with open(dataset_paths_file, 'w') as dpf:
        dpf.writelines('\n'.join(outpath_list))

    print('\nData import is done.\n\n')

    return method_names, dataset_paths_file, missing_data_flag, impute_strategy
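# A minimal, self-contained sketch (illustrative, not part of neuropredict) of the
# missing-data check used above: given a feature matrix, count how many subjects
# (rows) and features (columns) contain at least one NaN. Variable names are made up.
import numpy as np

demo_mat = np.random.rand(5, 4)
demo_mat[2, 1] = np.nan                              # introduce one missing value
is_nan_demo = np.isnan(demo_mat)
num_sub_with_md_demo = np.sum(is_nan_demo.sum(axis=1) > 0)   # subjects with missing data
num_var_with_md_demo = np.sum(is_nan_demo.sum(axis=0) > 0)   # features with missing data
print(num_sub_with_md_demo, num_var_with_md_demo)            # -> 1 1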
for base_feature in base_feature_list:
    print(' Processing {}'.format(base_feature))
    incomplete_processing[base_feature] = dict()
    comb_nan_values[base_feature] = dict()

    for stat_method in roi_stat_list:
        print('Gathering data for {}'.format(stat_method))
        expt_id = '{}_{}_{}_smoothing{}_size{}'.format(stat_method, base_feature,
                                                       atlas, fwhm, node_size)
        flag_nan_exists = False
        flag_incomplete = False
        flag_unexpected = False
        dataset = MLDataset()
        incomplete_processing[base_feature][stat_method] = dict()
        comb_nan_values[base_feature][stat_method] = dict()

        for ds_name in dataset_list:
            print(' working on {}'.format(ds_name))
            proc_dir = pjoin(base_dir, ds_name, 'processed')
            out_dir = pjoin(proc_dir, 'graynet',
                            '{}_{}_fwhm{}'.format(base_feature, atlas, fwhm))
            meta_list = pjoin(proc_dir, 'target_lists',
                              'meta_{}.csv'.format(ds_name))
            sample_ids, classes = run_workflow.get_metadata(meta_list)
            incomplete_processing[base_feature][stat_method][ds_name] = list()
            comb_nan_values[base_feature][stat_method][ds_name] = list()
import os
from sys import version_info

import numpy as np

if version_info >= (2, 7, 13):
    try:
        from pyradigm import MLDataset
    except:
        raise ImportError('could not import pyradigm')
else:
    raise NotImplementedError('pyradigm supports only 2.7.13 or 3+. '
                              'Upgrade to Python 3+ is recommended.')

out_dir = '.'

num_classes = np.random.randint(2, 50)
class_sizes = np.random.randint(10, 100, num_classes)
num_features = np.random.randint(10, 100)

class_set = np.array(['C{:05d}'.format(x) for x in range(num_classes)])
feat_names = np.array([str(x) for x in range(num_features)])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

# same IDs, new features
same_ids_new_feat = MLDataset()
for sub_id in test_dataset.keys:
    feat = np.random.random(num_features)
    same_ids_new_feat.add_sample(sub_id, feat,
                                 test_dataset.labels[sub_id],
                                 test_dataset.classes[sub_id])
def test_unpickling():
    out_file = os.path.join(out_dir, 'random_pickled_dataset.pkl')
    copy_dataset.save(out_file)
    reloaded_dataset = MLDataset(filepath=out_file,
                                 description='reloaded test_dataset')
    assert copy_dataset == reloaded_dataset
import os
import sys

import numpy as np

sys.dont_write_bytecode = True

from pytest import raises, warns, set_trace

from pyradigm import MLDataset

out_dir = '.'

num_classes = np.random.randint(2, 50)
class_sizes = np.random.randint(10, 1000, num_classes)
num_features = np.random.randint(10, 500)

class_set = np.array(['C{:05d}'.format(x) for x in range(num_classes)])
feat_names = np.array([str(x) for x in range(num_features)])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

class_set, label_set, class_sizes = test_dataset.summarize_classes()

reloaded_dataset = MLDataset(filepath=out_file,
                             description='reloaded test_dataset')
def run(dataset_path_file, method_names, out_results_dir,
        train_perc=0.8, num_repetitions=200, positive_class=None):
    """
    Parameters
    ----------
    dataset_path_file : str
        Path to a file containing the list of paths (each containing a valid MLDataset).

    method_names : list
        A list of names to denote the different feature extraction methods.

    out_results_dir : str
        Path to the output directory to save the cross-validation results to.

    train_perc : float, optional
        Percentage of subjects to train the classifier on. The percentage is
        applied to the size of the smallest class to estimate the number of
        subjects from each class to be reserved for training.
        The smallest class is chosen to avoid class imbalance in the training set.
        Default: 0.8 (80%).

    num_repetitions : int, optional
        Number of repetitions of cross-validation estimation. Default: 200.

    positive_class : str
        Name of the class to be treated as positive in the calculation of AUC.

    Returns
    -------
    results_path : str
        Path to the pickle file containing the full set of CV results.

    """

    # structure of this function:
    #   load datasets
    #   validate each dataset
    #       ensure the same number of subjects across all datasets
    #       the same number of features per subject in each dataset
    #       the same class set across all datasets
    #   re-map the labels (from 1 to n) to ensure numeric labels do not differ
    #   sort them if need be (not needed if MLDatasets)
    #   for rep 1 to N, for feat 1 to M,
    #       run train/test/evaluate
    #       keep tabs on misclassifications
    #   save results (in a comprehensive and reloadable manner)

    assert os.path.exists(dataset_path_file), "File containing dataset paths does not exist."
    with open(dataset_path_file, 'r') as dpf:
        dataset_paths = dpf.read().splitlines()

    try:
        out_results_dir = os.path.abspath(out_results_dir)
        if not os.path.exists(out_results_dir):
            os.mkdir(out_results_dir)
    except:
        raise IOError('Error in checking or creating the output directory. '
                      'Ensure write permissions!')

    num_repetitions = int(num_repetitions)
    assert num_repetitions < np.inf, "An infinite number of repetitions is not recommended!"
    assert num_repetitions > 1, "More than one repetition is necessary!"
    # TODO warn when num_repetitions is not sufficient: need a heuristic to assess it

    # loading datasets
    datasets = list()
    for fp in dataset_paths:
        assert os.path.exists(fp), "Dataset @ {} does not exist.".format(fp)
        try:
            # there is an internal validation of the dataset
            ds = MLDataset(fp)
        except:
            print("Dataset @ {} is not a valid MLDataset!".format(fp))
            raise

        # add the valid dataset to the list
        datasets.append(ds)

    # ensure the same number of subjects across all datasets
    num_datasets = int(len(datasets))
    # looking into the first dataset
    common_ds = datasets[0]
    class_set, label_set, class_sizes = common_ds.summarize_classes()
    num_samples = common_ds.num_samples
    num_classes = len(class_set)

    if num_datasets > 1:
        for idx in range(1, num_datasets):
            this_ds = datasets[idx]
            assert num_samples == this_ds.num_samples, \
                "Number of samples in different datasets differ!"
            assert set(class_set) == set(this_ds.classes.values()), \
                "Classes differ among datasets! \n One dataset: {} \n Another: {}".format(
                        set(class_set), set(this_ds.classes.values()))

    # re-map the labels (from 1 to n) to ensure numeric labels do not differ
    remapped_class_labels = dict()
    for idx, cls in enumerate(class_set):
        remapped_class_labels[cls] = idx

    # finding the numeric label for the positive class;
    # this label is also the index into the arrays over classes due to the construction above
    if num_classes == 2:
        if positive_class is None:
            positive_class = class_set[-1]
        # List.index(item) returns the first index of a match
        pos_class_index = class_set.index(positive_class)  # remapped_class_labels[positive_class]

    labels_with_correspondence = dict()
    for subid in common_ds.sample_ids:
        labels_with_correspondence[subid] = remapped_class_labels[common_ds.classes[subid]]

    for idx in range(num_datasets):
        datasets[idx].labels = labels_with_correspondence

    assert (train_perc >= 0.01 and train_perc <= 0.99), \
        "Training percentage {} out of bounds - must be within [0.01, 0.99]".format(train_perc)

    num_features = np.zeros(num_datasets).astype(np.int64)
    for idx in range(num_datasets):
        num_features[idx] = datasets[idx].num_features

    # determine the common size for training
    print("Different classes in the training set are stratified to match the smallest class!")
    train_size_per_class = np.int64(np.floor(train_perc * class_sizes).astype(np.float64))  # per-class
    train_size_common = np.int64(np.minimum(min(train_size_per_class), train_size_per_class))  # single number
    reduced_sizes = np.unique(train_size_common)
    assert len(reduced_sizes) == 1, \
        "Error in stratification of the training set based on the smallest class!"
    train_size_common = reduced_sizes[0]

    total_test_samples = np.int64(np.sum(class_sizes) - num_classes * train_size_common)

    pred_prob_per_class = np.full([num_repetitions, num_datasets,
                                   total_test_samples, num_classes], np.nan)
    pred_labels_per_rep_fs = np.full([num_repetitions, num_datasets, total_test_samples], np.nan)
    test_labels_per_rep = np.full([num_repetitions, total_test_samples], np.nan)

    best_min_leaf_size = np.full([num_repetitions, num_datasets], np.nan)
    best_num_predictors = np.full([num_repetitions, num_datasets], np.nan)

    # initialize misclassification counters
    num_times_tested = list()
    num_times_misclfd = list()
    for dd in range(num_datasets):
        num_times_tested.append(Counter(common_ds.sample_ids))
        num_times_misclfd.append(Counter(common_ds.sample_ids))
        for subid in common_ds.sample_ids:
            num_times_tested[dd][subid] = 0
            num_times_misclfd[dd][subid] = 0

    # multi-class metrics
    confusion_matrix = np.full([num_classes, num_classes, num_repetitions, num_datasets], np.nan)
    accuracy_balanced = np.full([num_repetitions, num_datasets], np.nan)
    auc_weighted = np.full([num_repetitions, num_datasets], np.nan)

    # # specificity & sensitivity are ill-defined in the general case, as they require
    # # knowing which class is positive; hence we refer to them from now on simply as
    # # correct classification rates (ccr).
    # # Moreover, this can easily be computed from the confusion matrix anyway.
    # ccr_perclass = np.full([num_repetitions, num_datasets, num_classes], np.nan)

    # binary metrics
    # TODO later: when are the uses of precision and recall appropriate?
    # precision = np.full([num_repetitions, num_datasets], np.nan)
    # recall = np.full([num_repetitions, num_datasets], np.nan)

    feature_names = [None] * num_datasets
    feature_importances_rf = [None] * num_datasets
    for idx in range(num_datasets):
        feature_importances_rf[idx] = np.full([num_repetitions, num_features[idx]], np.nan)
        feature_names[idx] = datasets[idx].feature_names

    # repeated hold-out CV begins here
    # TODO LATER implement a multi-process version, as different reps are embarrassingly parallel;
    #   use the following one-statement processing that can be forked to parallel threads:
    # pred_prob_per_class[rep, dd, :, :], pred_labels_per_rep_fs[rep, dd, :], \
    #     confmat, misclsfd_ids_this_run, feature_importances_rf[dd][rep, :], \
    #     best_min_leaf_size[rep, dd], best_num_predictors[rep, dd] \
    #     = holdout_evaluation(datasets, train_size_common, total_test_samples)

    max_width_method_names = max(map(len, method_names))

    for rep in range(num_repetitions):
        print("\n CV repetition {:3d} ".format(rep))

        # TODO to achieve feature- or method-level parallelization, the train/test splits
        #   need to be saved at the entry level for each subgroup and used here
        train_set, test_set = common_ds.train_test_split_ids(count_per_class=train_size_common)
        test_labels_per_rep[rep, :] = [common_ds.labels[sid]
                                       for sid in test_set if sid in common_ds.labels]

        # evaluating each feature/dataset
        # try setting test_labels_per_rep outside the dd loop, as it is the same across all dd
        for dd in range(num_datasets):
            # print("\t feature {:3d} {:>{}}: ".format(dd, method_names[dd], max_width_method_names), end='')
            print("\t feature {index:3d} {name:>{namewidth}} : "
                  "".format(index=dd, name=method_names[dd],
                            namewidth=max_width_method_names), end='')

            train_fs = datasets[dd].get_subset(train_set)
            test_fs = datasets[dd].get_subset(test_set)

            pred_prob_per_class[rep, dd, :, :], \
                pred_labels_per_rep_fs[rep, dd, :], true_test_labels, \
                confmat, misclsfd_ids_this_run, feature_importances_rf[dd][rep, :], \
                best_min_leaf_size[rep, dd], best_num_predictors[rep, dd] = \
                eval_optimized_clsfr_on_testset(train_fs, test_fs,
                                                label_order_in_CM=label_set)

            accuracy_balanced[rep, dd] = balanced_accuracy(confmat)
            confusion_matrix[:, :, rep, dd] = confmat
            print('balanced accuracy: {:.4f} '.format(accuracy_balanced[rep, dd]), end='')

            if num_classes == 2:
                # TODO FIX: AUC calculation is flipped
                # TODO store fpr and tpr per rep, and give the user the option to
                #   visualize the average if they wish
                auc_weighted[rep, dd] = roc_auc_score(
                        true_test_labels,
                        pred_prob_per_class[rep, dd, :, pos_class_index],
                        average='weighted')
                print('\t weighted AUC: {:.4f}'.format(auc_weighted[rep, dd]), end='')

            num_times_misclfd[dd].update(misclsfd_ids_this_run)
            num_times_tested[dd].update(test_fs.sample_ids)

            print('')

    # save results
    var_list_to_save = [dataset_paths, method_names, train_perc, num_repetitions, num_classes,
                        pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep,
                        best_min_leaf_size, best_num_predictors,
                        feature_importances_rf, feature_names,
                        num_times_misclfd, num_times_tested,
                        confusion_matrix, class_set,
                        accuracy_balanced, auc_weighted,
                        positive_class]

    var_names_to_save = ['dataset_paths', 'method_names', 'train_perc', 'num_repetitions', 'num_classes',
                         'pred_prob_per_class', 'pred_labels_per_rep_fs', 'test_labels_per_rep',
                         'best_min_leaf_size', 'best_num_predictors',
                         'feature_importances_rf', 'feature_names',
                         'num_times_misclfd', 'num_times_tested',
                         'confusion_matrix', 'class_set',
                         'accuracy_balanced', 'auc_weighted',
                         'positive_class']

    locals_var_dict = locals()
    dict_to_save = {var: locals_var_dict[var]
                    for var in cfg.rhst_data_variables_to_persist}

    out_results_path = save_results(out_results_dir, dict_to_save)

    return out_results_path
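# Hedged, self-contained sketch (illustrative only, not part of rhst) of the
# stratification rule used in run() above: the per-class training count is capped
# at train_perc times the smallest class size, so every class contributes the same
# number of training samples. The class sizes below are made up for the example.
import numpy as np

example_class_sizes = np.array([30, 45, 120])
example_train_perc = 0.8
per_class = np.int64(np.floor(example_train_perc * example_class_sizes))
common_train_size = int(np.minimum(min(per_class), per_class).min())
print(common_train_size)  # -> 24, i.e. floor(0.8 * 30) applied uniformly to all classes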